diff --git a/README.md b/README.md index a5818f7c..5a671a38 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,85 @@ +### Change log [2024-09-09 07:01:29] +1. Item Updated: `hugging_face_serving` (from version: `1.0.0` to `1.0.0`) +2. Item Updated: `model_monitoring_batch` (from version: `1.1.0` to `1.1.0`) +3. Item Updated: `aggregate` (from version: `1.3.0` to `1.3.0`) +4. Item Updated: `onnx_utils` (from version: `1.2.0` to `1.2.0`) +5. Item Updated: `batch_inference` (from version: `1.7.0` to `1.7.0`) +6. Item Updated: `test_classifier` (from version: `1.1.0` to `1.1.0`) +7. Item Updated: `coxph_test` (from version: `1.1.0` to `1.1.0`) +8. Item Updated: `batch_inference_v2` (from version: `2.5.0` to `2.5.0`) +9. Item Updated: `text_to_audio_generator` (from version: `1.1.0` to `1.1.0`) +10. Item Updated: `describe_spark` (from version: `1.1.0` to `1.1.0`) +11. Item Updated: `model_server_tester` (from version: `1.1.0` to `1.1.0`) +12. Item Updated: `arc_to_parquet` (from version: `1.4.1` to `1.4.1`) +13. Item Updated: `sklearn_classifier` (from version: `1.1.1` to `1.1.1`) +14. Item Updated: `validate_great_expectations` (from version: `1.1.0` to `1.1.0`) +15. Item Updated: `xgb_test` (from version: `1.1.1` to `1.1.1`) +16. Item Updated: `churn_server` (from version: `1.1.0` to `1.1.0`) +17. Item Updated: `coxph_trainer` (from version: `1.1.0` to `1.1.0`) +18. Item Updated: `v2_model_tester` (from version: `1.1.0` to `1.1.0`) +19. Item Updated: `model_server` (from version: `1.1.0` to `1.1.0`) +20. Item Updated: `pyannote_audio` (from version: `1.1.0` to `1.1.0`) +21. Item Updated: `open_archive` (from version: `1.1.0` to `1.1.0`) +22. Item Updated: `pii_recognizer` (from version: `0.2.0` to `0.2.0`) +23. Item Updated: `silero_vad` (from version: `1.2.0` to `1.2.0`) +24. Item Updated: `load_dataset` (from version: `1.2.0` to `1.2.0`) +25. Item Updated: `sklearn_classifier_dask` (from version: `1.1.1` to `1.1.1`) +26. 
Item Updated: `noise_reduction` (from version: `1.0.0` to `1.0.0`) +27. Item Updated: `github_utils` (from version: `1.1.0` to `1.1.0`) +28. Item Updated: `v2_model_server` (from version: `1.1.0` to `1.1.0`) +29. Item Updated: `azureml_serving` (from version: `1.1.0` to `1.1.0`) +30. Item Updated: `transcribe` (from version: `1.0.0` to `1.0.0`) +31. Item Updated: `gen_class_data` (from version: `1.2.0` to `1.2.0`) +32. Item Updated: `describe_dask` (from version: `1.1.0` to `1.1.0`) +33. Item Updated: `translate` (from version: `0.0.2` to `0.0.2`) +34. Item Updated: `structured_data_generator` (from version: `1.4.0` to `1.4.0`) +35. Item Updated: `tf2_serving` (from version: `1.1.0` to `1.1.0`) +36. Item Updated: `xgb_trainer` (from version: `1.1.1` to `1.1.1`) +37. Item Updated: `question_answering` (from version: `0.3.1` to `0.3.1`) +38. Item Updated: `azureml_utils` (from version: `1.3.0` to `1.3.0`) +39. Item Updated: `send_email` (from version: `1.2.0` to `1.2.0`) +40. Item Updated: `auto_trainer` (from version: `1.7.0` to `1.7.0`) +41. Item Updated: `feature_selection` (from version: `1.4.0` to `1.4.0`) +42. Item Updated: `describe` (from version: `1.2.0` to `1.2.0`) +43. Item Removed: `tf2_serving_v2` +44. Item Removed: `sql_to_file` +45. Item Removed: `bert_embeddings` +46. Item Removed: `pandas_profiling_report` +47. Item Removed: `slack_notify` +48. Item Removed: `xgb_serving` +49. Item Removed: `stream_to_parquet` +50. Item Removed: `concept_drift` +51. Item Removed: `tf1_serving` +52. Item Removed: `model_monitoring_stream` +53. Item Removed: `virtual_drift` +54. Item Removed: `rnn_serving` +55. Item Removed: `feature_perms` +56. Item Removed: `concept_drift_streaming` +57. Item Removed: `ingest` +58. Item Removed: `get_offline_features` +59. Item Removed: `snowflake_dask` +60. Item Removed: `hugging_face_classifier_trainer` +61. Item Removed: `huggingface_auto_trainer` +62. Item Removed: `ingest` +63. Item Removed: `concept_drift` +64. 
Item Removed: `tf2_serving_v2` +65. Item Removed: `sql_to_file` +66. Item Removed: `concept_drift_streaming` +67. Item Removed: `get_offline_features` +68. Item Removed: `feature_perms` +69. Item Removed: `slack_notify` +70. Item Removed: `virtual_drift` +71. Item Removed: `tf1_serving` +72. Item Removed: `stream_to_parquet` +73. Item Removed: `rnn_serving` +74. Item Removed: `xgb_serving` +75. Item Removed: `bert_embeddings` +76. Item Removed: `pandas_profiling_report` +77. Item Removed: `model_monitoring_stream` +78. Item Removed: `snowflake_dask` +79. Item Removed: `hugging_face_classifier_trainer` +80. Item Removed: `huggingface_auto_trainer` + ### Change log [2024-08-28 09:06:55] 1. Item Updated: `tf1_serving` (from version: `1.1.0` to `1.1.0`) 2. Item Updated: `sklearn_classifier_dask` (from version: `1.1.1` to `1.1.1`) diff --git a/catalog.json b/catalog.json index e860096c..173b8226 100644 --- a/catalog.json +++ b/catalog.json @@ -1 +1 @@ -{"functions": {"development": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": 
{"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", 
"categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": 
{"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 
image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL 
To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "0.9.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple 
Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", 
"handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": 
"aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": 
{"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", 
"requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": 
"describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": 
"describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}}, "model_server": {"latest": 
{"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, 
"url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", 
"generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}, "1.1.0": 
{"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost 
model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, 
"0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "stream_to_parquet": {"latest": 
{"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", 
"data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", 
"icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.2.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], 
"description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": 
"mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", 
"spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", 
"requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", 
"model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", 
"spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", 
"name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": 
{"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": 
"virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, 
"0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", 
"labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}}, 
"feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": 
["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": 
"v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards 
model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": 
"arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": 
{"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], 
"description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, 
"url": "", "version": "0.9.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", 
"machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", 
"version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "onnx_utils", "platformVersion": "", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": 
"mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": 
"3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.8.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "0.9.0"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], 
"description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils 
functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": 
"azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": 
true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", 
"azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "1.3.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, 
XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4"}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": 
"auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "0.10.3": {"apiVersion": "v1", "categories": 
["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", 
"generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": 
"v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}, "0.2.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": 
{"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": 
"mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.2.0": 
{"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": 
"huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": 
["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, 
XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, 
"with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": 
"", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": 
"infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": 
"2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": 
"2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data 
according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], 
"description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": 
["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) 
functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, 
"icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}}}, "master": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": 
{"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", 
"framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", 
"machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", 
"requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.2.0"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.3", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", 
"machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", 
"platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.3", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": 
"1.2.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["huggingface", 
"machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", 
"labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": 
"mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", 
"labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": 
"model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": 
"0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", 
"doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0"}, "0.0.1": 
{"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", 
"platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": 
{"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "stream_to_parquet": {"latest": 
{"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": 
["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", 
"hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": 
{"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", 
"categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": 
"send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", 
"platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": 
"mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}}, "tf1_serving": {"latest": 
{"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", 
"generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": 
"Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.2.0"}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": 
"livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": 
"", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", 
"requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model 
server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", 
"image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": 
"job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", 
"categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": 
"arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, 
"url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", 
"handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": 
"", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic 
sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": 
[]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", 
"handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of 
the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.6.3", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.3", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.3.0"}, "1.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML 
integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": 
{"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && 
apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", 
"requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5"}, "1.3.0": 
{"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, 
XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, 
"icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", 
"platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": 
["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.2.0": 
{"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": 
"HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", 
"datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": 
"answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": 
true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": 
"This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": 
false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": 
"pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": 
"Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", 
"spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML 
frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, 
"build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": 
{"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": 
"structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": 
{"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, 
"icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": 
{"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", 
"requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning", "mlflow"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning", "mlflow"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", 
"kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}}}}} \ No newline at end of file +{"functions": {"development": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": 
"sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test 
model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", 
"handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", 
"machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], 
"description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": 
{"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": 
"Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": 
{"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": 
"", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": 
"gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": 
"gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": 
"send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", 
"framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, 
"url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", 
"framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, 
"arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": 
["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": 
{"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": 
["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "onnx_utils", "platformVersion": "", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", 
"version": "0.10.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": 
"to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "0.8.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends 
git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1"}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML 
integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", 
"doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks 
- Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4"}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": 
"auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.4.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", 
"example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", 
"platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data 
drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": 
null}, "url": "", "version": "1.6.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": 
{"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": 
{"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", 
"tqdm"]}, "url": "", "version": "0.3.1"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", 
"st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", 
"test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": 
{"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.6.0": 
{"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", 
"data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": 
"0.0.1", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", 
"kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", 
"handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": 
"text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", 
"kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, 
"1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}}}, "master": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image 
classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", 
"platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": 
["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": 
"model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.2.0"}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], 
"description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.3", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": 
"feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.1": {"apiVersion": 
"v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": 
"feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.3", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over 
Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", 
"handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", "data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["huggingface", "machine-learning", 
"data-preparation", "pytorch"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], 
"description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.0.1": 
{"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": 
{"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", 
"example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", 
"platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": 
"1.1.0"}}, "load_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dask", "platformVersion": "3.2.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dask", "platformVersion": "", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": 
"mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "etl"], "description": "load dask cluster with data", "doc": "", "example": "load_dask.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dask", "platformVersion": "3.5.0", "spec": {"filename": "load_dask.py", "handler": "load_dask", "image": "mlrun/ml-models", "kind": "dask", "requirements": []}, "url": "", "version": "1.1.0"}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", 
"platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0"}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": 
"", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": 
"0.0.1"}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0"}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", 
"platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, 
"min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", 
"version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", 
"requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.10.0": {"apiVersion": 
"v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", 
"platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", 
"doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": 
"sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": 
{"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0"}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": 
"", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": 
"", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0"}, 
"1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.2.0"}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0"}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude 
between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0"}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": 
"serving", "requirements": null}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": 
{"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0"}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": 
"0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 
model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", 
"platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and 
save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": 
"job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": 
false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", 
"platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", 
"handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": 
[]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1"}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", 
"handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0"}}, "ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of 
the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.6.3", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.3", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.3.0"}, "1.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML 
integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": 
{"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && 
apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", 
"requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5"}, "1.3.0": 
{"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, 
XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, 
"icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", 
"platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": 
["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.2.0": 
{"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": 
"HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", 
"datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.3.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0"}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": 
"answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": 
true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": 
"This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": 
false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": 
"pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": 
"Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", 
"spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML 
frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, 
"build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": 
{"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": 
"structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": 
{"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, 
"icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": 
{"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", 
"requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning", "mlflow"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning", "mlflow"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", 
"kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}}}}} \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.0.1/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/0.0.1/src/bert_embeddings.ipynb deleted file mode 100644 index 7a7b5826..00000000 --- a/functions/development/bert_embeddings/0.0.1/src/bert_embeddings.ipynb +++ /dev/null @@ -1,270 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install torch==1.6.0\n", - "pip install transformers==3.0.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### function code" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import BertModel, BertTokenizer\n", - "import torch\n", - "from typing import Union, List\n", - "import json\n", - "import pickle\n", - "\n", - "def init_context(context):\n", - " tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", - " model = BertModel.from_pretrained('bert-base-uncased')\n", - " model.eval()\n", - " setattr(context.user_data, 'tokenizer', tokenizer)\n", - " setattr(context.user_data, 'model', model)\n", - "\n", - "def handler(context, event):\n", - " docs = json.loads(event.body)\n", - " docs = 
[doc.lower() for doc in docs]\n", - " docs = context.user_data.tokenizer.batch_encode_plus(docs, pad_to_max_length=True, return_tensors='pt')\n", - " with torch.no_grad():\n", - " embeddings = context.user_data.model(**docs)\n", - " embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]\n", - " return pickle.dumps(embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### local test " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "event = nuclio.Event(body=json.dumps(['John loves Mary']))\n", - "init_context(context)\n", - "outputs = pickle.loads(handler(context, event))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a good chance to view the outputs of this BERT model. It gives two different outputs. The first is a contextual embedding for each token in the input sequence and the second is a pooled embedding for the complete sequence." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (1, 5, 768), pooled embeddings shape: (1, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {outputs[0].shape}, pooled embeddings shape: {outputs[1].shape}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As seen both outputs share first dimension size of 1. This corresponds to the single sequence we passed as input, \"John loves Mary\". The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert. 
Note that the first input has an intermediate dimension of size 5 that corresponds to the number of tokens in the input sequence after addtion of two special tokens marking beginning and end of a sequence by the tokenizer." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Deploy as serverless function" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-11 13:16:26,161 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "fn = code_to_function(\"bert-embeddings\", kind=\"nuclio\",\n", - " description=\"Get BERT based embeddings for given text\",\n", - " categories=[\"NLP\", \"BERT\", \"embeddings\"],\n", - " labels = {\"author\": \"roye\", \"framework\": \"pytorch\"},\n", - " code_output='.')\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-11 13:16:30,576 deploy started\n", - "[nuclio] 2020-06-11 13:16:38,751 (info) Build complete\n", - "[nuclio] 2020-06-11 13:16:58,965 (info) Function deploy complete\n", - "[nuclio] 2020-06-11 13:16:58,972 done updating nlp-servers-bert-embeddings, function address: 192.168.224.208:31596\n", - "[mlrun] 2020-06-11 13:16:58,982 warning!, server (0.4.7) and client (0.4.8) ver dont match\n" - ] - } - ], - "source": [ - "addr = fn.deploy(project='nlp-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Test the function via http request" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "\n", - "\n", - 
"event_data = ['the quick brown fox jumps over the lazy dog', 'Hello I am Jacob']\n", - "resp = requests.post(addr, json=json.dumps(event_data))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (2, 11, 768), pooled embeddings shape: (2, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can see that the size of the first dimension of the outputs is two since we passed in two sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/0.0.1/src/bert_embeddings.py b/functions/development/bert_embeddings/0.0.1/src/bert_embeddings.py deleted file mode 100644 index 92610a06..00000000 --- a/functions/development/bert_embeddings/0.0.1/src/bert_embeddings.py +++ /dev/null @@ -1,27 +0,0 @@ -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/0.0.1/src/function.yaml b/functions/development/bert_embeddings/0.0.1/src/function.yaml deleted file mode 100644 index 5d5dd14f..00000000 --- a/functions/development/bert_embeddings/0.0.1/src/function.yaml +++ /dev/null @@ -1,44 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd - project: 
default - labels: - framework: pytorch - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Get BERT based embeddings for given text - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: bert-embeddings - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py - spec: - runtime: python:3.6 - handler: bert_embeddings:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - source: '' - build: - commands: - - python -m pip install torch==1.6.0 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py - default_handler: handler - affinity: null -verbose: false diff --git 
a/functions/development/bert_embeddings/0.0.1/src/item.yaml b/functions/development/bert_embeddings/0.0.1/src/item.yaml deleted file mode 100644 index 728d0818..00000000 --- a/functions/development/bert_embeddings/0.0.1/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2021-05-19:22-04 -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.5.4 -name: bert-embeddings -platformVersion: 2.10.0 -spec: - filename: bert_embeddings.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - torch==1.6.0 -url: '' -version: 0.0.1 diff --git a/functions/development/bert_embeddings/0.0.1/src/requirements.txt b/functions/development/bert_embeddings/0.0.1/src/requirements.txt deleted file mode 100644 index d7ee31ee..00000000 --- a/functions/development/bert_embeddings/0.0.1/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mlrun -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.0.1/src/test_bert_embeddings.py b/functions/development/bert_embeddings/0.0.1/src/test_bert_embeddings.py deleted file mode 100644 index 320dff20..00000000 --- a/functions/development/bert_embeddings/0.0.1/src/test_bert_embeddings.py +++ /dev/null @@ -1,18 +0,0 @@ -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 
else False) is True - diff --git a/functions/development/bert_embeddings/0.0.1/static/documentation.html b/functions/development/bert_embeddings/0.0.1/static/documentation.html deleted file mode 100644 index ae4d0424..00000000 --- a/functions/development/bert_embeddings/0.0.1/static/documentation.html +++ /dev/null @@ -1,133 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

bert_embeddings package

-
-

Submodules

-
-
-

bert_embeddings.bert_embeddings module

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.0.1/static/example.html b/functions/development/bert_embeddings/0.0.1/static/example.html deleted file mode 100644 index bea41725..00000000 --- a/functions/development/bert_embeddings/0.0.1/static/example.html +++ /dev/null @@ -1,279 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

BERT Embeddings Serverless Function

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install torch==1.6.0
-pip install transformers==3.0.1
-
-
-
-
-
-

function code

-
-
-
from transformers import BertModel, BertTokenizer
-import torch
-from typing import Union, List
-import json
-import pickle
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-    model = BertModel.from_pretrained('bert-base-uncased')
-    model.eval()
-    setattr(context.user_data, 'tokenizer', tokenizer)
-    setattr(context.user_data, 'model', model)
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(docs, pad_to_max_length=True, return_tensors='pt')
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

local test

-
-
-
event = nuclio.Event(body=json.dumps(['John loves Mary']))
-init_context(context)
-outputs = pickle.loads(handler(context, event))
-
-
-
-
-

This is a good chance to view the outputs of this BERT model. It gives two different outputs. The first is a contextual embedding for each token in the input sequence and the second is a pooled embedding for the complete sequence.

-
-
-
print(f'embeddings per token shape: {outputs[0].shape}, pooled embeddings shape: {outputs[1].shape}')
-
-
-
-
-
embeddings per token shape: (1, 5, 768), pooled embeddings shape: (1, 768)
-
-
-
-
-

As seen both outputs share first dimension size of 1. This corresponds to the single sequence we passed as input, “John loves Mary”. The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert. Note that the first input has an intermediate dimension of size 5 that corresponds to the number of tokens in the input sequence after addtion of two special tokens marking beginning and end of a sequence by the tokenizer.

-
-
-

Deploy as serverless function

-
-
-
from mlrun import code_to_function
-fn = code_to_function("bert-embeddings", kind="nuclio",
-                      description="Get BERT based embeddings for given text",
-                      categories=["NLP", "BERT", "embeddings"],
-                      labels = {"author": "roye", "framework": "pytorch"},
-                      code_output='.')
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-11 13:16:26,161 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f1649c00128>
-
-
-
-
-
-
-
addr = fn.deploy(project='nlp-servers')
-
-
-
-
-
[mlrun] 2020-06-11 13:16:30,576 deploy started
-[nuclio] 2020-06-11 13:16:38,751 (info) Build complete
-[nuclio] 2020-06-11 13:16:58,965 (info) Function deploy complete
-[nuclio] 2020-06-11 13:16:58,972 done updating nlp-servers-bert-embeddings, function address: 192.168.224.208:31596
-[mlrun] 2020-06-11 13:16:58,982 warning!, server (0.4.7) and client (0.4.8) ver dont match
-
-
-
-
-
-

Test the function via http request

-
-
-
import requests
-
-
-event_data = ['the quick brown fox jumps over the lazy dog', 'Hello I am Jacob']
-resp = requests.post(addr, json=json.dumps(event_data))
-
-
-
-
-
-
-
output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (2, 11, 768), pooled embeddings shape: (2, 768)
-
-
-
-
-

Now we can see that the size of the first dimension of the outputs is two since we passed in two sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.0.1/static/function.html b/functions/development/bert_embeddings/0.0.1/static/function.html deleted file mode 100644 index ec7b4e48..00000000 --- a/functions/development/bert_embeddings/0.0.1/static/function.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd
-  project: default
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Get BERT based embeddings for given text
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: bert-embeddings
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py
-    spec:
-      runtime: python:3.6
-      handler: bert_embeddings:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-  source: ''
-  build:
-    commands:
-    - python -m pip install torch==1.6.0
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.0.1/static/item.html b/functions/development/bert_embeddings/0.0.1/static/item.html deleted file mode 100644 index 6d823267..00000000 --- a/functions/development/bert_embeddings/0.0.1/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2021-05-19:22-04
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.5.4
-name: bert-embeddings
-platformVersion: 2.10.0
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - torch==1.6.0
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.0.1/static/source.html b/functions/development/bert_embeddings/0.0.1/static/source.html deleted file mode 100644 index 3c3de0a9..00000000 --- a/functions/development/bert_embeddings/0.0.1/static/source.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.8.0/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/0.8.0/src/bert_embeddings.ipynb deleted file mode 100644 index 663ed084..00000000 --- a/functions/development/bert_embeddings/0.8.0/src/bert_embeddings.ipynb +++ /dev/null @@ -1,520 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Both server & client are aligned (0.6.5).\n" - ] - } - ], - "source": [ - "!/User/align_mlrun.sh" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embeddings without bert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", - "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# some sentences to do examine\n", - "sentences = ['the quick brown fox jumps over the lazy dog',\n", - " 'Hello I am Jacob',\n", - " 'Daniel visited Tel-Aviv last month']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "lets see the difference between bert embeddings and one-hot encoding" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" - ] - } - ], - "source": [ - "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", - "tokens = []\n", - "for sentence in sentences:\n", - " for word in sentence.split():\n", - " tokens.append(word) if word not in tokens else \"\"\n", - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# constructing the one hot vector\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", - "# filling our empty dataframe with each sentence encoding\n", - "for sentence in sentences:\n", - " vector = np.zeros(len(tokens))\n", - " for word in sentence.split():\n", - " vector[tokens.index(word)]=1\n", - " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", - "one_hot.columns = tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", - "
" - ], - "text/plain": [ - " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", - "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " Daniel visited Tel-Aviv last month \n", - "0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 \n", - "2 1.0 1.0 1.0 1.0 1.0 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_hot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", - "this representation is very slim and will be a very weak learning dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introducing Bert embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function, auto_mount" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# importing the function from the hub\n", - "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-08-11 14:49:02,307 [info] Starting remote function deploy\n", - "2021-08-11 14:49:02 (info) Deploying function\n", - "2021-08-11 14:49:02 (info) Building\n", - "2021-08-11 14:49:03 (info) Staging files and preparing base images\n", - "2021-08-11 14:49:03 (info) Building processor image\n", - "2021-08-11 14:49:10 (info) Build complete\n", - "2021-08-11 14:49:42 (info) Function deploy complete\n", - "> 2021-08-11 14:49:43,742 [info] function deployed, address=default-tenant.app.dev39.lab.iguazeng.com:32500\n" - ] - } - ], - "source": 
[ - "# deploying the function\n", - "addr = fn.deploy(project = \"function-marketplace\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "# sending a request to the function endpoint to get the sentences' embeddings\n", - "resp = requests.post(addr, json=json.dumps(sentences))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964340.7979080.4351760.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438210.8939340.646276-0.2793880.9435130.275504-0.555108-0.999992...0.582385-0.0046130.9760790.931517-0.3914420.5303840.675933-0.682720-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612531-0.7087860.840879
\n", - "

3 rows × 768 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", - "1 -0.953005 -0.535132 -0.743821 0.893934 0.646276 -0.279388 0.943513 \n", - "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", - "\n", - " 7 8 9 ... 758 759 760 761 \\\n", - "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196434 0.797908 \n", - "1 0.275504 -0.555108 -0.999992 ... 0.582385 -0.004613 0.976079 0.931517 \n", - "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", - "\n", - " 762 763 764 765 766 767 \n", - "0 0.435176 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", - "1 -0.391442 0.530384 0.675933 -0.682720 -0.746339 0.957809 \n", - "2 -0.787489 0.246137 0.676243 -0.612531 -0.708786 0.840879 \n", - "\n", - "[3 rows x 768 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(output_embeddings[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", - "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", - "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", - "Now you tell me, which encoding are you gonna use in your project ??" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/0.8.0/src/bert_embeddings.py b/functions/development/bert_embeddings/0.8.0/src/bert_embeddings.py deleted file mode 100644 index 92610a06..00000000 --- a/functions/development/bert_embeddings/0.8.0/src/bert_embeddings.py +++ /dev/null @@ -1,27 +0,0 @@ -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/0.8.0/src/function.yaml b/functions/development/bert_embeddings/0.8.0/src/function.yaml deleted file mode 100644 index 5d5dd14f..00000000 --- a/functions/development/bert_embeddings/0.8.0/src/function.yaml +++ /dev/null @@ -1,44 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd - project: default - labels: - framework: 
pytorch - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Get BERT based embeddings for given text - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: bert-embeddings - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py - spec: - runtime: python:3.6 - handler: bert_embeddings:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - source: '' - build: - commands: - - python -m pip install torch==1.6.0 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py - default_handler: handler - affinity: null -verbose: false diff --git 
a/functions/development/bert_embeddings/0.8.0/src/item.yaml b/functions/development/bert_embeddings/0.8.0/src/item.yaml deleted file mode 100644 index 3bec2745..00000000 --- a/functions/development/bert_embeddings/0.8.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2021-05-19:22-04 -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: bert-embeddings -platformVersion: 3.2.0 -spec: - filename: bert_embeddings.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - torch==1.6.0 -url: '' -version: 0.8.0 diff --git a/functions/development/bert_embeddings/0.8.0/src/requirements.txt b/functions/development/bert_embeddings/0.8.0/src/requirements.txt deleted file mode 100644 index d7ee31ee..00000000 --- a/functions/development/bert_embeddings/0.8.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mlrun -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.8.0/src/test_bert_embeddings.py b/functions/development/bert_embeddings/0.8.0/src/test_bert_embeddings.py deleted file mode 100644 index 320dff20..00000000 --- a/functions/development/bert_embeddings/0.8.0/src/test_bert_embeddings.py +++ /dev/null @@ -1,18 +0,0 @@ -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 
else False) is True - diff --git a/functions/development/bert_embeddings/0.8.0/static/documentation.html b/functions/development/bert_embeddings/0.8.0/static/documentation.html deleted file mode 100644 index ae4d0424..00000000 --- a/functions/development/bert_embeddings/0.8.0/static/documentation.html +++ /dev/null @@ -1,133 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

bert_embeddings package

-
-

Submodules

-
-
-

bert_embeddings.bert_embeddings module

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.8.0/static/example.html b/functions/development/bert_embeddings/0.8.0/static/example.html deleted file mode 100644 index 506a6040..00000000 --- a/functions/development/bert_embeddings/0.8.0/static/example.html +++ /dev/null @@ -1,489 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

BERT Embeddings Serverless Function

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-
!/User/align_mlrun.sh
-
-
-
-
-
Both server & client are aligned (0.6.5).
-
-
-
-
-
-
-

Embeddings without bert

-

One-Hot Encoding is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
-in case of text embeddings, each row is a sentence and each column is a word/char/n-gram.

-
-
-
# some sentences to do examine
-sentences = ['the quick brown fox jumps over the lazy dog',
-              'Hello I am Jacob',
-              'Daniel visited Tel-Aviv last month']
-
-
-
-
-

lets see the difference between bert embeddings and one-hot encoding

-
-
-
# constructing a list of all the words (will be our columns) - make sure no duplicate words are set
-tokens = []
-for sentence in sentences:
-    for word in sentence.split():
-        tokens.append(word) if word not in tokens else ""
-print(tokens)
-
-
-
-
-
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
-
-
-
-
-
-
-
# constructing the one hot vector
-import pandas as pd
-import numpy as np
-
-one_hot = pd.DataFrame(columns = range(len(tokens)))
-# filling our empty dataframe with each sentence encoding
-for sentence in sentences:
-    vector = np.zeros(len(tokens))
-    for word in sentence.split():
-        vector[tokens.index(word)]=1
-    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
-one_hot.columns = tokens
-
-
-
-
-
-
-
one_hot
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
-
-
-

The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word. -this representation is very slim and will be a very weak learning dataset.

-
-
-

Introducing Bert embeddings

-
-
-
from mlrun import import_function, auto_mount
-
-
-
-
-
-
-
# importing the function from the hub
-fn = import_function("hub://bert_embeddings").apply(auto_mount())
-
-
-
-
-
-
-
# deploying the function
-addr = fn.deploy(project = "function-marketplace")
-
-
-
-
-
> 2021-08-11 14:49:02,307 [info] Starting remote function deploy
-2021-08-11 14:49:02  (info) Deploying function
-2021-08-11 14:49:02  (info) Building
-2021-08-11 14:49:03  (info) Staging files and preparing base images
-2021-08-11 14:49:03  (info) Building processor image
-2021-08-11 14:49:10  (info) Build complete
-2021-08-11 14:49:42  (info) Function deploy complete
-> 2021-08-11 14:49:43,742 [info] function deployed, address=default-tenant.app.dev39.lab.iguazeng.com:32500
-
-
-
-
-
-
-
import requests
-import json
-# sending a request to the function endpoint to get the sentences' embeddings
-resp = requests.post(addr, json=json.dumps(sentences))
-
-
-
-
-
-
-
import pickle
-output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
-
-
-
-
-
-
-
pd.DataFrame(output_embeddings[1])
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964340.7979080.4351760.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438210.8939340.646276-0.2793880.9435130.275504-0.555108-0.999992...0.582385-0.0046130.9760790.931517-0.3914420.5303840.675933-0.682720-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612531-0.7087860.840879
-

3 rows × 768 columns

-
-
-

we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
-Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
-The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
-Now you tell me, which encoding are you gonna use in your project ??

-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.8.0/static/function.html b/functions/development/bert_embeddings/0.8.0/static/function.html deleted file mode 100644 index ec7b4e48..00000000 --- a/functions/development/bert_embeddings/0.8.0/static/function.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd
-  project: default
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Get BERT based embeddings for given text
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: bert-embeddings
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py
-    spec:
-      runtime: python:3.6
-      handler: bert_embeddings:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-  source: ''
-  build:
-    commands:
-    - python -m pip install torch==1.6.0
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.8.0/static/item.html b/functions/development/bert_embeddings/0.8.0/static/item.html deleted file mode 100644 index 716a9cfb..00000000 --- a/functions/development/bert_embeddings/0.8.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2021-05-19:22-04
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: bert-embeddings
-platformVersion: 3.2.0
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - torch==1.6.0
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.8.0/static/source.html b/functions/development/bert_embeddings/0.8.0/static/source.html deleted file mode 100644 index 3c3de0a9..00000000 --- a/functions/development/bert_embeddings/0.8.0/static/source.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.9.0/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/0.9.0/src/bert_embeddings.ipynb deleted file mode 100644 index 663ed084..00000000 --- a/functions/development/bert_embeddings/0.9.0/src/bert_embeddings.ipynb +++ /dev/null @@ -1,520 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Both server & client are aligned (0.6.5).\n" - ] - } - ], - "source": [ - "!/User/align_mlrun.sh" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embeddings without bert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", - "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# some sentences to do examine\n", - "sentences = ['the quick brown fox jumps over the lazy dog',\n", - " 'Hello I am Jacob',\n", - " 'Daniel visited Tel-Aviv last month']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "lets see the difference between bert embeddings and one-hot encoding" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" - ] - } - ], - "source": [ - "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", - "tokens = []\n", - "for sentence in sentences:\n", - " for word in sentence.split():\n", - " tokens.append(word) if word not in tokens else \"\"\n", - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# constructing the one hot vector\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", - "# filling our empty dataframe with each sentence encoding\n", - "for sentence in sentences:\n", - " vector = np.zeros(len(tokens))\n", - " for word in sentence.split():\n", - " vector[tokens.index(word)]=1\n", - " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", - "one_hot.columns = tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", - "
" - ], - "text/plain": [ - " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", - "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " Daniel visited Tel-Aviv last month \n", - "0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 \n", - "2 1.0 1.0 1.0 1.0 1.0 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_hot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", - "this representation is very slim and will be a very weak learning dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introducing Bert embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function, auto_mount" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# importing the function from the hub\n", - "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-08-11 14:49:02,307 [info] Starting remote function deploy\n", - "2021-08-11 14:49:02 (info) Deploying function\n", - "2021-08-11 14:49:02 (info) Building\n", - "2021-08-11 14:49:03 (info) Staging files and preparing base images\n", - "2021-08-11 14:49:03 (info) Building processor image\n", - "2021-08-11 14:49:10 (info) Build complete\n", - "2021-08-11 14:49:42 (info) Function deploy complete\n", - "> 2021-08-11 14:49:43,742 [info] function deployed, address=default-tenant.app.dev39.lab.iguazeng.com:32500\n" - ] - } - ], - "source": 
[ - "# deploying the function\n", - "addr = fn.deploy(project = \"function-marketplace\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "# sending a request to the function endpoint to get the sentences' embeddings\n", - "resp = requests.post(addr, json=json.dumps(sentences))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964340.7979080.4351760.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438210.8939340.646276-0.2793880.9435130.275504-0.555108-0.999992...0.582385-0.0046130.9760790.931517-0.3914420.5303840.675933-0.682720-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612531-0.7087860.840879
\n", - "

3 rows × 768 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", - "1 -0.953005 -0.535132 -0.743821 0.893934 0.646276 -0.279388 0.943513 \n", - "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", - "\n", - " 7 8 9 ... 758 759 760 761 \\\n", - "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196434 0.797908 \n", - "1 0.275504 -0.555108 -0.999992 ... 0.582385 -0.004613 0.976079 0.931517 \n", - "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", - "\n", - " 762 763 764 765 766 767 \n", - "0 0.435176 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", - "1 -0.391442 0.530384 0.675933 -0.682720 -0.746339 0.957809 \n", - "2 -0.787489 0.246137 0.676243 -0.612531 -0.708786 0.840879 \n", - "\n", - "[3 rows x 768 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(output_embeddings[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", - "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", - "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", - "Now you tell me, which encoding are you gonna use in your project ??" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/0.9.0/src/bert_embeddings.py b/functions/development/bert_embeddings/0.9.0/src/bert_embeddings.py deleted file mode 100644 index 92610a06..00000000 --- a/functions/development/bert_embeddings/0.9.0/src/bert_embeddings.py +++ /dev/null @@ -1,27 +0,0 @@ -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/0.9.0/src/function.yaml b/functions/development/bert_embeddings/0.9.0/src/function.yaml deleted file mode 100644 index da59cd56..00000000 --- a/functions/development/bert_embeddings/0.9.0/src/function.yaml +++ /dev/null @@ -1,44 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd - project: '' - labels: - framework: pytorch 
- categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Get BERT based embeddings for given text - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: bert-embeddings - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py - spec: - runtime: python:3.6 - handler: bert_embeddings:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - source: '' - build: - commands: - - python -m pip install torch==1.6.0 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/bert_embeddings/0.9.0/src/item.yaml 
b/functions/development/bert_embeddings/0.9.0/src/item.yaml deleted file mode 100644 index 0f74ed52..00000000 --- a/functions/development/bert_embeddings/0.9.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: bert-embeddings -platformVersion: 3.2.0 -spec: - filename: bert_embeddings.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - torch==1.6.0 -url: '' -version: 0.9.0 diff --git a/functions/development/bert_embeddings/0.9.0/src/requirements.txt b/functions/development/bert_embeddings/0.9.0/src/requirements.txt deleted file mode 100644 index d7ee31ee..00000000 --- a/functions/development/bert_embeddings/0.9.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mlrun -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.9.0/src/test_bert_embeddings.py b/functions/development/bert_embeddings/0.9.0/src/test_bert_embeddings.py deleted file mode 100644 index 320dff20..00000000 --- a/functions/development/bert_embeddings/0.9.0/src/test_bert_embeddings.py +++ /dev/null @@ -1,18 +0,0 @@ -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 else False) is True - diff --git 
a/functions/development/bert_embeddings/0.9.0/static/documentation.html b/functions/development/bert_embeddings/0.9.0/static/documentation.html deleted file mode 100644 index ae4d0424..00000000 --- a/functions/development/bert_embeddings/0.9.0/static/documentation.html +++ /dev/null @@ -1,133 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

bert_embeddings package

-
-

Submodules

-
-
-

bert_embeddings.bert_embeddings module

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.9.0/static/example.html b/functions/development/bert_embeddings/0.9.0/static/example.html deleted file mode 100644 index 506a6040..00000000 --- a/functions/development/bert_embeddings/0.9.0/static/example.html +++ /dev/null @@ -1,489 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

BERT Embeddings Serverless Function

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-
!/User/align_mlrun.sh
-
-
-
-
-
Both server & client are aligned (0.6.5).
-
-
-
-
-
-
-

Embeddings without bert

-

One-Hot Encoding is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
-in case of text embeddings, each row is a sentence and each column is a word/char/n-gram.

-
-
-
# some sentences to do examine
-sentences = ['the quick brown fox jumps over the lazy dog',
-              'Hello I am Jacob',
-              'Daniel visited Tel-Aviv last month']
-
-
-
-
-

lets see the difference between bert embeddings and one-hot encoding

-
-
-
# constructing a list of all the words (will be our columns) - make sure no duplicate words are set
-tokens = []
-for sentence in sentences:
-    for word in sentence.split():
-        tokens.append(word) if word not in tokens else ""
-print(tokens)
-
-
-
-
-
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
-
-
-
-
-
-
-
# constructing the one hot vector
-import pandas as pd
-import numpy as np
-
-one_hot = pd.DataFrame(columns = range(len(tokens)))
-# filling our empty dataframe with each sentence encoding
-for sentence in sentences:
-    vector = np.zeros(len(tokens))
-    for word in sentence.split():
-        vector[tokens.index(word)]=1
-    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
-one_hot.columns = tokens
-
-
-
-
-
-
-
one_hot
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
-
-
-

The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word. -this representation is very slim and will be a very weak learning dataset.

-
-
-

Introducing Bert embeddings

-
-
-
from mlrun import import_function, auto_mount
-
-
-
-
-
-
-
# importing the function from the hub
-fn = import_function("hub://bert_embeddings").apply(auto_mount())
-
-
-
-
-
-
-
# deploying the function
-addr = fn.deploy(project = "function-marketplace")
-
-
-
-
-
> 2021-08-11 14:49:02,307 [info] Starting remote function deploy
-2021-08-11 14:49:02  (info) Deploying function
-2021-08-11 14:49:02  (info) Building
-2021-08-11 14:49:03  (info) Staging files and preparing base images
-2021-08-11 14:49:03  (info) Building processor image
-2021-08-11 14:49:10  (info) Build complete
-2021-08-11 14:49:42  (info) Function deploy complete
-> 2021-08-11 14:49:43,742 [info] function deployed, address=default-tenant.app.dev39.lab.iguazeng.com:32500
-
-
-
-
-
-
-
import requests
-import json
-# sending a request to the function endpoint to get the sentences' embeddings
-resp = requests.post(addr, json=json.dumps(sentences))
-
-
-
-
-
-
-
import pickle
-output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
-
-
-
-
-
-
-
pd.DataFrame(output_embeddings[1])
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964340.7979080.4351760.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438210.8939340.646276-0.2793880.9435130.275504-0.555108-0.999992...0.582385-0.0046130.9760790.931517-0.3914420.5303840.675933-0.682720-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612531-0.7087860.840879
-

3 rows × 768 columns

-
-
-

we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
-Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
-The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
-Now you tell me, which encoding are you gonna use in your project ??

-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.9.0/static/function.html b/functions/development/bert_embeddings/0.9.0/static/function.html deleted file mode 100644 index 32d2fe58..00000000 --- a/functions/development/bert_embeddings/0.9.0/static/function.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd
-  project: ''
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Get BERT based embeddings for given text
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: bert-embeddings
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py
-    spec:
-      runtime: python:3.6
-      handler: bert_embeddings:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-  source: ''
-  build:
-    commands:
-    - python -m pip install torch==1.6.0
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.9.0/static/item.html b/functions/development/bert_embeddings/0.9.0/static/item.html deleted file mode 100644 index b5416a9a..00000000 --- a/functions/development/bert_embeddings/0.9.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: bert-embeddings
-platformVersion: 3.2.0
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - torch==1.6.0
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/0.9.0/static/source.html b/functions/development/bert_embeddings/0.9.0/static/source.html deleted file mode 100644 index 3c3de0a9..00000000 --- a/functions/development/bert_embeddings/0.9.0/static/source.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.0/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/1.1.0/src/bert_embeddings.ipynb deleted file mode 100644 index 663ed084..00000000 --- a/functions/development/bert_embeddings/1.1.0/src/bert_embeddings.ipynb +++ /dev/null @@ -1,520 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Both server & client are aligned (0.6.5).\n" - ] - } - ], - "source": [ - "!/User/align_mlrun.sh" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embeddings without bert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", - "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# some sentences to do examine\n", - "sentences = ['the quick brown fox jumps over the lazy dog',\n", - " 'Hello I am Jacob',\n", - " 'Daniel visited Tel-Aviv last month']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "lets see the difference between bert embeddings and one-hot encoding" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" - ] - } - ], - "source": [ - "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", - "tokens = []\n", - "for sentence in sentences:\n", - " for word in sentence.split():\n", - " tokens.append(word) if word not in tokens else \"\"\n", - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# constructing the one hot vector\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", - "# filling our empty dataframe with each sentence encoding\n", - "for sentence in sentences:\n", - " vector = np.zeros(len(tokens))\n", - " for word in sentence.split():\n", - " vector[tokens.index(word)]=1\n", - " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", - "one_hot.columns = tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", - "
" - ], - "text/plain": [ - " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", - "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " Daniel visited Tel-Aviv last month \n", - "0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 \n", - "2 1.0 1.0 1.0 1.0 1.0 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_hot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", - "this representation is very slim and will be a very weak learning dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introducing Bert embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function, auto_mount" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# importing the function from the hub\n", - "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-08-11 14:49:02,307 [info] Starting remote function deploy\n", - "2021-08-11 14:49:02 (info) Deploying function\n", - "2021-08-11 14:49:02 (info) Building\n", - "2021-08-11 14:49:03 (info) Staging files and preparing base images\n", - "2021-08-11 14:49:03 (info) Building processor image\n", - "2021-08-11 14:49:10 (info) Build complete\n", - "2021-08-11 14:49:42 (info) Function deploy complete\n", - "> 2021-08-11 14:49:43,742 [info] function deployed, address=default-tenant.app.dev39.lab.iguazeng.com:32500\n" - ] - } - ], - "source": 
[ - "# deploying the function\n", - "addr = fn.deploy(project = \"function-marketplace\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "# sending a request to the function endpoint to get the sentences' embeddings\n", - "resp = requests.post(addr, json=json.dumps(sentences))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964340.7979080.4351760.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438210.8939340.646276-0.2793880.9435130.275504-0.555108-0.999992...0.582385-0.0046130.9760790.931517-0.3914420.5303840.675933-0.682720-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612531-0.7087860.840879
\n", - "

3 rows × 768 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", - "1 -0.953005 -0.535132 -0.743821 0.893934 0.646276 -0.279388 0.943513 \n", - "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", - "\n", - " 7 8 9 ... 758 759 760 761 \\\n", - "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196434 0.797908 \n", - "1 0.275504 -0.555108 -0.999992 ... 0.582385 -0.004613 0.976079 0.931517 \n", - "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", - "\n", - " 762 763 764 765 766 767 \n", - "0 0.435176 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", - "1 -0.391442 0.530384 0.675933 -0.682720 -0.746339 0.957809 \n", - "2 -0.787489 0.246137 0.676243 -0.612531 -0.708786 0.840879 \n", - "\n", - "[3 rows x 768 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(output_embeddings[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", - "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", - "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", - "Now you tell me, which encoding are you gonna use in your project ??" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/1.1.0/src/bert_embeddings.py b/functions/development/bert_embeddings/1.1.0/src/bert_embeddings.py deleted file mode 100644 index 109081b1..00000000 --- a/functions/development/bert_embeddings/1.1.0/src/bert_embeddings.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/1.1.0/src/function.yaml b/functions/development/bert_embeddings/1.1.0/src/function.yaml deleted file mode 100644 index da59cd56..00000000 --- a/functions/development/bert_embeddings/1.1.0/src/function.yaml +++ /dev/null @@ -1,44 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd - project: '' - labels: - framework: pytorch - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Get BERT based embeddings for given text - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: bert-embeddings - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py - spec: - runtime: python:3.6 - handler: bert_embeddings:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - source: '' - build: - commands: - - python -m pip install torch==1.6.0 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/bert_embeddings/1.1.0/src/item.yaml b/functions/development/bert_embeddings/1.1.0/src/item.yaml deleted file mode 100644 index f9b72d2c..00000000 --- a/functions/development/bert_embeddings/1.1.0/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: bert-embeddings -platformVersion: 3.5.0 -spec: - filename: bert_embeddings.py 
- handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - torch==1.6.0 -url: '' -version: 1.1.0 diff --git a/functions/development/bert_embeddings/1.1.0/src/requirements.txt b/functions/development/bert_embeddings/1.1.0/src/requirements.txt deleted file mode 100644 index d7ee31ee..00000000 --- a/functions/development/bert_embeddings/1.1.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mlrun -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.0/src/test_bert_embeddings.py b/functions/development/bert_embeddings/1.1.0/src/test_bert_embeddings.py deleted file mode 100644 index 7ad9101c..00000000 --- a/functions/development/bert_embeddings/1.1.0/src/test_bert_embeddings.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 else False) is True - diff --git a/functions/development/bert_embeddings/1.1.0/static/documentation.html b/functions/development/bert_embeddings/1.1.0/static/documentation.html deleted file mode 100644 index 9432d662..00000000 --- a/functions/development/bert_embeddings/1.1.0/static/documentation.html +++ /dev/null @@ -1,136 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

bert_embeddings package

-
-

Submodules

-
-
-

bert_embeddings.bert_embeddings module

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.0/static/example.html b/functions/development/bert_embeddings/1.1.0/static/example.html deleted file mode 100644 index 10a74f0f..00000000 --- a/functions/development/bert_embeddings/1.1.0/static/example.html +++ /dev/null @@ -1,492 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

BERT Embeddings Serverless Function

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-
!/User/align_mlrun.sh
-
-
-
-
-
Both server & client are aligned (0.6.5).
-
-
-
-
-
-
-

Embeddings without bert

-

One-Hot Encoding is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
-in case of text embeddings, each row is a sentence and each column is a word/char/n-gram.

-
-
-
# some sentences to do examine
-sentences = ['the quick brown fox jumps over the lazy dog',
-              'Hello I am Jacob',
-              'Daniel visited Tel-Aviv last month']
-
-
-
-
-

lets see the difference between bert embeddings and one-hot encoding

-
-
-
# constructing a list of all the words (will be our columns) - make sure no duplicate words are set
-tokens = []
-for sentence in sentences:
-    for word in sentence.split():
-        tokens.append(word) if word not in tokens else ""
-print(tokens)
-
-
-
-
-
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
-
-
-
-
-
-
-
# constructing the one hot vector
-import pandas as pd
-import numpy as np
-
-one_hot = pd.DataFrame(columns = range(len(tokens)))
-# filling our empty dataframe with each sentence encoding
-for sentence in sentences:
-    vector = np.zeros(len(tokens))
-    for word in sentence.split():
-        vector[tokens.index(word)]=1
-    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
-one_hot.columns = tokens
-
-
-
-
-
-
-
one_hot
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
-
-
-

The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word. -this representation is very slim and will be a very weak learning dataset.

-
-
-

Introducing Bert embeddings

-
-
-
from mlrun import import_function, auto_mount
-
-
-
-
-
-
-
# importing the function from the hub
-fn = import_function("hub://bert_embeddings").apply(auto_mount())
-
-
-
-
-
-
-
# deploying the function
-addr = fn.deploy(project = "function-marketplace")
-
-
-
-
-
> 2021-08-11 14:49:02,307 [info] Starting remote function deploy
-2021-08-11 14:49:02  (info) Deploying function
-2021-08-11 14:49:02  (info) Building
-2021-08-11 14:49:03  (info) Staging files and preparing base images
-2021-08-11 14:49:03  (info) Building processor image
-2021-08-11 14:49:10  (info) Build complete
-2021-08-11 14:49:42  (info) Function deploy complete
-> 2021-08-11 14:49:43,742 [info] function deployed, address=default-tenant.app.dev39.lab.iguazeng.com:32500
-
-
-
-
-
-
-
import requests
-import json
-# sending a request to the function endpoint to get the sentences' embeddings
-resp = requests.post(addr, json=json.dumps(sentences))
-
-
-
-
-
-
-
import pickle
-output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
-
-
-
-
-
-
-
pd.DataFrame(output_embeddings[1])
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964340.7979080.4351760.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438210.8939340.646276-0.2793880.9435130.275504-0.555108-0.999992...0.582385-0.0046130.9760790.931517-0.3914420.5303840.675933-0.682720-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612531-0.7087860.840879
-

3 rows × 768 columns

-
-
-

we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
-Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
-The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
-Now you tell me, which encoding are you gonna use in your project ??

-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.0/static/function.html b/functions/development/bert_embeddings/1.1.0/static/function.html deleted file mode 100644 index 32d2fe58..00000000 --- a/functions/development/bert_embeddings/1.1.0/static/function.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd
-  project: ''
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Get BERT based embeddings for given text
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: bert-embeddings
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py
-    spec:
-      runtime: python:3.6
-      handler: bert_embeddings:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-  source: ''
-  build:
-    commands:
-    - python -m pip install torch==1.6.0
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.0/static/item.html b/functions/development/bert_embeddings/1.1.0/static/item.html deleted file mode 100644 index 78201dba..00000000 --- a/functions/development/bert_embeddings/1.1.0/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: bert-embeddings
-platformVersion: 3.5.0
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - torch==1.6.0
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.0/static/source.html b/functions/development/bert_embeddings/1.1.0/static/source.html deleted file mode 100644 index 1df4accf..00000000 --- a/functions/development/bert_embeddings/1.1.0/static/source.html +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/1.1.1/src/bert_embeddings.ipynb deleted file mode 100644 index cb6d5584..00000000 --- a/functions/development/bert_embeddings/1.1.1/src/bert_embeddings.ipynb +++ /dev/null @@ -1,503 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embeddings without bert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", - "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# some sentences to do examine\n", - "sentences = ['the quick brown fox jumps over the lazy dog',\n", - " 'Hello I am Jacob',\n", - " 'Daniel visited Tel-Aviv last month']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "lets see the difference between bert embeddings and one-hot encoding" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" - ] - } - ], - "source": [ - "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", - "tokens = []\n", - "for sentence in sentences:\n", - " for word in sentence.split():\n", - " tokens.append(word) if word not in tokens else \"\"\n", - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# constructing the one hot vector\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", - "# filling our empty dataframe with each sentence encoding\n", - "for sentence in sentences:\n", - " vector = np.zeros(len(tokens))\n", - " for word in sentence.split():\n", - " vector[tokens.index(word)]=1\n", - " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", - "one_hot.columns = tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", - "
" - ], - "text/plain": [ - " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", - "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " Daniel visited Tel-Aviv last month \n", - "0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 \n", - "2 1.0 1.0 1.0 1.0 1.0 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_hot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", - "this representation is very slim and will be a very weak learning dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introducing Bert embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function, auto_mount" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# importing the function from the hub\n", - "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2023-02-02 09:29:59,002 [info] Starting remote function deploy\n", - "2023-02-02 09:29:59 (info) Deploying function\n", - "2023-02-02 09:29:59 (info) Building\n", - "2023-02-02 09:29:59 (info) Staging files and preparing base images\n", - "2023-02-02 09:29:59 (info) Building processor image\n", - "2023-02-02 09:32:09 (info) Build complete\n", - "2023-02-02 09:32:35 (info) Function deploy complete\n", - "> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': 
['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}\n" - ] - } - ], - "source": [ - "# deploying the function\n", - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "# sending a request to the function endpoint to get the sentences' embeddings\n", - "resp = requests.post(addr, json=json.dumps(sentences))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964330.7979080.4351750.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438220.8939340.646276-0.2793880.9435130.275504-0.555109-0.999992...0.582386-0.0046140.9760790.931517-0.3914420.5303840.675933-0.682721-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612532-0.7087860.840879
\n", - "

3 rows × 768 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", - "1 -0.953005 -0.535132 -0.743822 0.893934 0.646276 -0.279388 0.943513 \n", - "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", - "\n", - " 7 8 9 ... 758 759 760 761 \\\n", - "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196433 0.797908 \n", - "1 0.275504 -0.555109 -0.999992 ... 0.582386 -0.004614 0.976079 0.931517 \n", - "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", - "\n", - " 762 763 764 765 766 767 \n", - "0 0.435175 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", - "1 -0.391442 0.530384 0.675933 -0.682721 -0.746339 0.957809 \n", - "2 -0.787489 0.246137 0.676243 -0.612532 -0.708786 0.840879 \n", - "\n", - "[3 rows x 768 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(output_embeddings[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", - "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", - "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", - "Now you tell me, which encoding are you gonna use in your project ??" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/1.1.1/src/bert_embeddings.py b/functions/development/bert_embeddings/1.1.1/src/bert_embeddings.py deleted file mode 100644 index 109081b1..00000000 --- a/functions/development/bert_embeddings/1.1.1/src/bert_embeddings.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/1.1.1/src/function.yaml b/functions/development/bert_embeddings/1.1.1/src/function.yaml deleted file mode 100644 index da59cd56..00000000 --- a/functions/development/bert_embeddings/1.1.1/src/function.yaml +++ /dev/null @@ -1,44 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd - project: '' - labels: - framework: pytorch - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Get BERT based embeddings for given text - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: bert-embeddings - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py - spec: - runtime: python:3.6 - handler: bert_embeddings:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - source: '' - build: - commands: - - python -m pip install torch==1.6.0 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/bert_embeddings/1.1.1/src/item.yaml b/functions/development/bert_embeddings/1.1.1/src/item.yaml deleted file mode 100644 index 206d8a78..00000000 --- a/functions/development/bert_embeddings/1.1.1/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: bert-embeddings -platformVersion: 3.5.0 -spec: - filename: bert_embeddings.py 
- handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - torch==1.6.0 -url: '' -version: 1.1.1 diff --git a/functions/development/bert_embeddings/1.1.1/src/requirements.txt b/functions/development/bert_embeddings/1.1.1/src/requirements.txt deleted file mode 100644 index d7ee31ee..00000000 --- a/functions/development/bert_embeddings/1.1.1/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mlrun -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/src/test_bert_embeddings.py b/functions/development/bert_embeddings/1.1.1/src/test_bert_embeddings.py deleted file mode 100644 index 7ad9101c..00000000 --- a/functions/development/bert_embeddings/1.1.1/src/test_bert_embeddings.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 else False) is True - diff --git a/functions/development/bert_embeddings/1.1.1/static/bert_embeddings.html b/functions/development/bert_embeddings/1.1.1/static/bert_embeddings.html deleted file mode 100644 index 863a0261..00000000 --- a/functions/development/bert_embeddings/1.1.1/static/bert_embeddings.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - -bert_embeddings.bert_embeddings - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for bert_embeddings.bert_embeddings

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-
[docs]def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model)
- - -
[docs]def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/static/documentation.html b/functions/development/bert_embeddings/1.1.1/static/documentation.html deleted file mode 100644 index b99c805d..00000000 --- a/functions/development/bert_embeddings/1.1.1/static/documentation.html +++ /dev/null @@ -1,230 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

bert_embeddings package

- -
- -
-
-
-
-
-

bert_embeddings package#

-
-

Submodules#

-
-
-

bert_embeddings.bert_embeddings module#

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]#
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/static/example.html b/functions/development/bert_embeddings/1.1.1/static/example.html deleted file mode 100644 index 59984155..00000000 --- a/functions/development/bert_embeddings/1.1.1/static/example.html +++ /dev/null @@ -1,584 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

BERT Embeddings Serverless Function

- -
- -
-
-
-
-
-

BERT Embeddings Serverless Function#

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-

Embeddings without bert#

-

One-Hot Encoding is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
-in case of text embeddings, each row is a sentence and each column is a word/char/n-gram.

-
-
-
# some sentences to do examine
-sentences = ['the quick brown fox jumps over the lazy dog',
-              'Hello I am Jacob',
-              'Daniel visited Tel-Aviv last month']
-
-
-
-
-

lets see the difference between bert embeddings and one-hot encoding

-
-
-
# constructing a list of all the words (will be our columns) - make sure no duplicate words are set
-tokens = []
-for sentence in sentences:
-    for word in sentence.split():
-        tokens.append(word) if word not in tokens else ""
-print(tokens)
-
-
-
-
-
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
-
-
-
-
-
-
-
# constructing the one hot vector
-import pandas as pd
-import numpy as np
-
-one_hot = pd.DataFrame(columns = range(len(tokens)))
-# filling our empty dataframe with each sentence encoding
-for sentence in sentences:
-    vector = np.zeros(len(tokens))
-    for word in sentence.split():
-        vector[tokens.index(word)]=1
-    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
-one_hot.columns = tokens
-
-
-
-
-
-
-
one_hot
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
-
-
-

The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word. -this representation is very slim and will be a very weak learning dataset.

-
-
-

Introducing Bert embeddings#

-
-
-
from mlrun import import_function, auto_mount
-
-
-
-
-
-
-
# importing the function from the hub
-fn = import_function("hub://bert_embeddings").apply(auto_mount())
-
-
-
-
-
-
-
# deploying the function
-addr = fn.deploy()
-
-
-
-
-
> 2023-02-02 09:29:59,002 [info] Starting remote function deploy
-2023-02-02 09:29:59  (info) Deploying function
-2023-02-02 09:29:59  (info) Building
-2023-02-02 09:29:59  (info) Staging files and preparing base images
-2023-02-02 09:29:59  (info) Building processor image
-2023-02-02 09:32:09  (info) Build complete
-2023-02-02 09:32:35  (info) Function deploy complete
-> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}
-
-
-
-
-
-
-
import requests
-import json
-# sending a request to the function endpoint to get the sentences' embeddings
-resp = requests.post(addr, json=json.dumps(sentences))
-
-
-
-
-
-
-
import pickle
-output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
-
-
-
-
-
-
-
pd.DataFrame(output_embeddings[1])
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964330.7979080.4351750.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438220.8939340.646276-0.2793880.9435130.275504-0.555109-0.999992...0.582386-0.0046140.9760790.931517-0.3914420.5303840.675933-0.682721-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612532-0.7087860.840879
-

3 rows × 768 columns

-
-
-

we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
-Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
-The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
-Now you tell me, which encoding are you gonna use in your project ??

-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/static/function.html b/functions/development/bert_embeddings/1.1.1/static/function.html deleted file mode 100644 index 32d2fe58..00000000 --- a/functions/development/bert_embeddings/1.1.1/static/function.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 77dd0921ea6d23ed5c0e944037e291e2cd6727fd
-  project: ''
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Get BERT based embeddings for given text
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: bert-embeddings
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/bert_embeddings/bert_embeddings.py
-    spec:
-      runtime: python:3.6
-      handler: bert_embeddings:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-  source: ''
-  build:
-    commands:
-    - python -m pip install torch==1.6.0
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/bert_embeddings/bert_embeddings.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/static/item.html b/functions/development/bert_embeddings/1.1.1/static/item.html deleted file mode 100644 index fc040937..00000000 --- a/functions/development/bert_embeddings/1.1.1/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: bert-embeddings
-platformVersion: 3.5.0
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - torch==1.6.0
-url: ''
-version: 1.1.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.1.1/static/source.html b/functions/development/bert_embeddings/1.1.1/static/source.html deleted file mode 100644 index 1df4accf..00000000 --- a/functions/development/bert_embeddings/1.1.1/static/source.html +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/1.2.0/src/bert_embeddings.ipynb deleted file mode 100644 index cb6d5584..00000000 --- a/functions/development/bert_embeddings/1.2.0/src/bert_embeddings.ipynb +++ /dev/null @@ -1,503 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embeddings without bert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", - "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# some sentences to do examine\n", - "sentences = ['the quick brown fox jumps over the lazy dog',\n", - " 'Hello I am Jacob',\n", - " 'Daniel visited Tel-Aviv last month']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "lets see the difference between bert embeddings and one-hot encoding" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" - ] - } - ], - "source": [ - "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", - "tokens = []\n", - "for sentence in sentences:\n", - " for word in sentence.split():\n", - " tokens.append(word) if word not in tokens else \"\"\n", - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# constructing the one hot vector\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", - "# filling our empty dataframe with each sentence encoding\n", - "for sentence in sentences:\n", - " vector = np.zeros(len(tokens))\n", - " for word in sentence.split():\n", - " vector[tokens.index(word)]=1\n", - " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", - "one_hot.columns = tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", - "
" - ], - "text/plain": [ - " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", - "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " Daniel visited Tel-Aviv last month \n", - "0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 \n", - "2 1.0 1.0 1.0 1.0 1.0 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_hot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", - "this representation is very slim and will be a very weak learning dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introducing Bert embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function, auto_mount" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# importing the function from the hub\n", - "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2023-02-02 09:29:59,002 [info] Starting remote function deploy\n", - "2023-02-02 09:29:59 (info) Deploying function\n", - "2023-02-02 09:29:59 (info) Building\n", - "2023-02-02 09:29:59 (info) Staging files and preparing base images\n", - "2023-02-02 09:29:59 (info) Building processor image\n", - "2023-02-02 09:32:09 (info) Build complete\n", - "2023-02-02 09:32:35 (info) Function deploy complete\n", - "> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': 
['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}\n" - ] - } - ], - "source": [ - "# deploying the function\n", - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "# sending a request to the function endpoint to get the sentences' embeddings\n", - "resp = requests.post(addr, json=json.dumps(sentences))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964330.7979080.4351750.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438220.8939340.646276-0.2793880.9435130.275504-0.555109-0.999992...0.582386-0.0046140.9760790.931517-0.3914420.5303840.675933-0.682721-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612532-0.7087860.840879
\n", - "

3 rows × 768 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", - "1 -0.953005 -0.535132 -0.743822 0.893934 0.646276 -0.279388 0.943513 \n", - "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", - "\n", - " 7 8 9 ... 758 759 760 761 \\\n", - "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196433 0.797908 \n", - "1 0.275504 -0.555109 -0.999992 ... 0.582386 -0.004614 0.976079 0.931517 \n", - "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", - "\n", - " 762 763 764 765 766 767 \n", - "0 0.435175 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", - "1 -0.391442 0.530384 0.675933 -0.682721 -0.746339 0.957809 \n", - "2 -0.787489 0.246137 0.676243 -0.612532 -0.708786 0.840879 \n", - "\n", - "[3 rows x 768 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(output_embeddings[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", - "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", - "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", - "Now you tell me, which encoding are you gonna use in your project ??" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/1.2.0/src/bert_embeddings.py b/functions/development/bert_embeddings/1.2.0/src/bert_embeddings.py deleted file mode 100644 index 109081b1..00000000 --- a/functions/development/bert_embeddings/1.2.0/src/bert_embeddings.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/1.2.0/src/function.yaml b/functions/development/bert_embeddings/1.2.0/src/function.yaml deleted file mode 100644 index 4a3fcf54..00000000 --- a/functions/development/bert_embeddings/1.2.0/src/function.yaml +++ /dev/null @@ -1,38 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 57a2ce8e0da1f6e813a8649e9ea6fcbb69a1ce5f - project: '' - labels: - framework: pytorch - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: 
IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - commands: [] - code_origin: http://github.com/aviaIguazio/functions.git#a1c9940e4c2420c88063768b4038e29b1f4e37a6:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py - 
origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py - requirements: - - torch - description: Get BERT based embeddings for given text - default_handler: '' - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - min_replicas: 1 - max_replicas: 4 - source: '' - function_handler: bert_embeddings:handler - base_image_pull: false - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/bert_embeddings/1.2.0/src/item.yaml b/functions/development/bert_embeddings/1.2.0/src/item.yaml deleted file mode 100644 index f0eaed1c..00000000 --- a/functions/development/bert_embeddings/1.2.0/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.1 -name: bert-embeddings -platformVersion: 3.5.3 -spec: - filename: bert_embeddings.py - handler: handler - image: mlrun/mlrun - kind: nuclio - requirements: - - torch -url: '' -version: 1.2.0 diff --git a/functions/development/bert_embeddings/1.2.0/src/requirements.txt b/functions/development/bert_embeddings/1.2.0/src/requirements.txt deleted file mode 100644 index 747b7aa9..00000000 --- a/functions/development/bert_embeddings/1.2.0/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/src/test_bert_embeddings.py b/functions/development/bert_embeddings/1.2.0/src/test_bert_embeddings.py deleted file mode 100644 index 7ad9101c..00000000 --- a/functions/development/bert_embeddings/1.2.0/src/test_bert_embeddings.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the 
Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 else False) is True - diff --git a/functions/development/bert_embeddings/1.2.0/static/bert_embeddings.html b/functions/development/bert_embeddings/1.2.0/static/bert_embeddings.html deleted file mode 100644 index 863a0261..00000000 --- a/functions/development/bert_embeddings/1.2.0/static/bert_embeddings.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - -bert_embeddings.bert_embeddings - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for bert_embeddings.bert_embeddings

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-
[docs]def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model)
- - -
[docs]def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/static/documentation.html b/functions/development/bert_embeddings/1.2.0/static/documentation.html deleted file mode 100644 index b99c805d..00000000 --- a/functions/development/bert_embeddings/1.2.0/static/documentation.html +++ /dev/null @@ -1,230 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

bert_embeddings package

- -
- -
-
-
-
-
-

bert_embeddings package#

-
-

Submodules#

-
-
-

bert_embeddings.bert_embeddings module#

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]#
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/static/example.html b/functions/development/bert_embeddings/1.2.0/static/example.html deleted file mode 100644 index 59984155..00000000 --- a/functions/development/bert_embeddings/1.2.0/static/example.html +++ /dev/null @@ -1,584 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

BERT Embeddings Serverless Function

- -
- -
-
-
-
-
-

BERT Embeddings Serverless Function#

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-

Embeddings without bert#

-

One-Hot Encoding is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
-in case of text embeddings, each row is a sentence and each column is a word/char/n-gram.

-
-
-
# some sentences to do examine
-sentences = ['the quick brown fox jumps over the lazy dog',
-              'Hello I am Jacob',
-              'Daniel visited Tel-Aviv last month']
-
-
-
-
-

lets see the difference between bert embeddings and one-hot encoding

-
-
-
# constructing a list of all the words (will be our columns) - make sure no duplicate words are set
-tokens = []
-for sentence in sentences:
-    for word in sentence.split():
-        tokens.append(word) if word not in tokens else ""
-print(tokens)
-
-
-
-
-
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
-
-
-
-
-
-
-
# constructing the one hot vector
-import pandas as pd
-import numpy as np
-
-one_hot = pd.DataFrame(columns = range(len(tokens)))
-# filling our empty dataframe with each sentence encoding
-for sentence in sentences:
-    vector = np.zeros(len(tokens))
-    for word in sentence.split():
-        vector[tokens.index(word)]=1
-    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
-one_hot.columns = tokens
-
-
-
-
-
-
-
one_hot
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
-
-
-

The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word. -this representation is very slim and will be a very weak learning dataset.

-
-
-

Introducing Bert embeddings#

-
-
-
from mlrun import import_function, auto_mount
-
-
-
-
-
-
-
# importing the function from the hub
-fn = import_function("hub://bert_embeddings").apply(auto_mount())
-
-
-
-
-
-
-
# deploying the function
-addr = fn.deploy()
-
-
-
-
-
> 2023-02-02 09:29:59,002 [info] Starting remote function deploy
-2023-02-02 09:29:59  (info) Deploying function
-2023-02-02 09:29:59  (info) Building
-2023-02-02 09:29:59  (info) Staging files and preparing base images
-2023-02-02 09:29:59  (info) Building processor image
-2023-02-02 09:32:09  (info) Build complete
-2023-02-02 09:32:35  (info) Function deploy complete
-> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}
-
-
-
-
-
-
-
import requests
-import json
-# sending a request to the function endpoint to get the sentences' embeddings
-resp = requests.post(addr, json=json.dumps(sentences))
-
-
-
-
-
-
-
import pickle
-output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
-
-
-
-
-
-
-
pd.DataFrame(output_embeddings[1])
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964330.7979080.4351750.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438220.8939340.646276-0.2793880.9435130.275504-0.555109-0.999992...0.582386-0.0046140.9760790.931517-0.3914420.5303840.675933-0.682721-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612532-0.7087860.840879
-

3 rows × 768 columns

-
-
-

we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
-Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
-The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
-Now you tell me, which encoding are you gonna use in your project ??

-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/static/function.html b/functions/development/bert_embeddings/1.2.0/static/function.html deleted file mode 100644 index 985f9e26..00000000 --- a/functions/development/bert_embeddings/1.2.0/static/function.html +++ /dev/null @@ -1,60 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 57a2ce8e0da1f6e813a8649e9ea6fcbb69a1ce5f
-  project: ''
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-    commands: []
-    code_origin: http://github.com/aviaIguazio/functions.git#a1c9940e4c2420c88063768b4038e29b1f4e37a6:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py
-    origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py
-    requirements:
-    - torch
-  description: Get BERT based embeddings for given text
-  default_handler: ''
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  min_replicas: 1
-  max_replicas: 4
-  source: ''
-  function_handler: bert_embeddings:handler
-  base_image_pull: false
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/static/item.html b/functions/development/bert_embeddings/1.2.0/static/item.html deleted file mode 100644 index 612e78b6..00000000 --- a/functions/development/bert_embeddings/1.2.0/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.1
-name: bert-embeddings
-platformVersion: 3.5.3
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio
-  requirements:
-  - torch
-url: ''
-version: 1.2.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/1.2.0/static/source.html b/functions/development/bert_embeddings/1.2.0/static/source.html deleted file mode 100644 index 1df4accf..00000000 --- a/functions/development/bert_embeddings/1.2.0/static/source.html +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/src/bert_embeddings.ipynb b/functions/development/bert_embeddings/latest/src/bert_embeddings.ipynb deleted file mode 100644 index cb6d5584..00000000 --- a/functions/development/bert_embeddings/latest/src/bert_embeddings.ipynb +++ /dev/null @@ -1,503 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERT Embeddings Serverless Function\n", - "This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embeddings without bert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[One-Hot Encoding](https://en.wikipedia.org/wiki/One-hot) is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
\n", - "in case of text embeddings, each row is a sentence and each column is a word/char/[n-gram](https://en.wikipedia.org/wiki/N-gram)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# some sentences to do examine\n", - "sentences = ['the quick brown fox jumps over the lazy dog',\n", - " 'Hello I am Jacob',\n", - " 'Daniel visited Tel-Aviv last month']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "lets see the difference between bert embeddings and one-hot encoding" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']\n" - ] - } - ], - "source": [ - "# constructing a list of all the words (will be our columns) - make sure no duplicate words are set\n", - "tokens = []\n", - "for sentence in sentences:\n", - " for word in sentence.split():\n", - " tokens.append(word) if word not in tokens else \"\"\n", - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# constructing the one hot vector\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "one_hot = pd.DataFrame(columns = range(len(tokens)))\n", - "# filling our empty dataframe with each sentence encoding\n", - "for sentence in sentences:\n", - " vector = np.zeros(len(tokens))\n", - " for word in sentence.split():\n", - " vector[tokens.index(word)]=1\n", - " one_hot = one_hot.append(pd.Series(vector),ignore_index=True)\n", - "one_hot.columns = tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
\n", - "
" - ], - "text/plain": [ - " the quick brown fox jumps over lazy dog Hello I am Jacob \\\n", - "0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - " Daniel visited Tel-Aviv last month \n", - "0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 \n", - "2 1.0 1.0 1.0 1.0 1.0 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_hot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word.\n", - "this representation is very slim and will be a very weak learning dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introducing Bert embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function, auto_mount" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# importing the function from the hub\n", - "fn = import_function(\"hub://bert_embeddings\").apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2023-02-02 09:29:59,002 [info] Starting remote function deploy\n", - "2023-02-02 09:29:59 (info) Deploying function\n", - "2023-02-02 09:29:59 (info) Building\n", - "2023-02-02 09:29:59 (info) Staging files and preparing base images\n", - "2023-02-02 09:29:59 (info) Building processor image\n", - "2023-02-02 09:32:09 (info) Build complete\n", - "2023-02-02 09:32:35 (info) Function deploy complete\n", - "> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': 
['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}\n" - ] - } - ], - "source": [ - "# deploying the function\n", - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "# sending a request to the function endpoint to get the sentences' embeddings\n", - "resp = requests.post(addr, json=json.dumps(sentences))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "output_embeddings = pickle.loads(resp.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)\n" - ] - } - ], - "source": [ - "print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964330.7979080.4351750.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438220.8939340.646276-0.2793880.9435130.275504-0.555109-0.999992...0.582386-0.0046140.9760790.931517-0.3914420.5303840.675933-0.682721-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612532-0.7087860.840879
\n", - "

3 rows × 768 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.733322 -0.223540 0.342462 0.383463 -0.164796 0.040522 0.802845 \n", - "1 -0.953005 -0.535132 -0.743822 0.893934 0.646276 -0.279388 0.943513 \n", - "2 -0.843678 -0.453405 -0.826011 0.650805 0.494036 -0.154117 0.821642 \n", - "\n", - " 7 8 9 ... 758 759 760 761 \\\n", - "0 0.152842 0.331639 -0.999779 ... 0.206564 0.231415 0.196433 0.797908 \n", - "1 0.275504 -0.555109 -0.999992 ... 0.582386 -0.004614 0.976079 0.931517 \n", - "2 0.349507 -0.650629 -0.999978 ... 0.618286 -0.336700 0.936262 0.857577 \n", - "\n", - " 762 763 764 765 766 767 \n", - "0 0.435175 0.749370 0.246098 0.427603 -0.577384 0.842063 \n", - "1 -0.391442 0.530384 0.675933 -0.682721 -0.746339 0.957809 \n", - "2 -0.787489 0.246137 0.676243 -0.612532 -0.708786 0.840879 \n", - "\n", - "[3 rows x 768 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(output_embeddings[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
\n", - "Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
\n", - "The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
\n", - "Now you tell me, which encoding are you gonna use in your project ??" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/bert_embeddings/latest/src/bert_embeddings.py b/functions/development/bert_embeddings/latest/src/bert_embeddings.py deleted file mode 100644 index 109081b1..00000000 --- a/functions/development/bert_embeddings/latest/src/bert_embeddings.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import pickle - -import torch -from transformers import BertModel, BertTokenizer - - -def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model) - - -def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings) diff --git a/functions/development/bert_embeddings/latest/src/function.yaml b/functions/development/bert_embeddings/latest/src/function.yaml deleted file mode 100644 index 4a3fcf54..00000000 --- a/functions/development/bert_embeddings/latest/src/function.yaml +++ /dev/null @@ -1,38 +0,0 @@ -kind: remote -metadata: - name: bert-embeddings - tag: '' - hash: 57a2ce8e0da1f6e813a8649e9ea6fcbb69a1ce5f - project: '' - labels: - framework: pytorch - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: 
IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo= - commands: [] - code_origin: http://github.com/aviaIguazio/functions.git#a1c9940e4c2420c88063768b4038e29b1f4e37a6:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py - 
origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py - requirements: - - torch - description: Get BERT based embeddings for given text - default_handler: '' - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - min_replicas: 1 - max_replicas: 4 - source: '' - function_handler: bert_embeddings:handler - base_image_pull: false - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/bert_embeddings/latest/src/item.yaml b/functions/development/bert_embeddings/latest/src/item.yaml deleted file mode 100644 index f0eaed1c..00000000 --- a/functions/development/bert_embeddings/latest/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Get BERT based embeddings for given text -doc: '' -example: bert_embeddings.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - framework: pytorch -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.1 -name: bert-embeddings -platformVersion: 3.5.3 -spec: - filename: bert_embeddings.py - handler: handler - image: mlrun/mlrun - kind: nuclio - requirements: - - torch -url: '' -version: 1.2.0 diff --git a/functions/development/bert_embeddings/latest/src/requirements.txt b/functions/development/bert_embeddings/latest/src/requirements.txt deleted file mode 100644 index 747b7aa9..00000000 --- a/functions/development/bert_embeddings/latest/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -transformers \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/src/test_bert_embeddings.py b/functions/development/bert_embeddings/latest/src/test_bert_embeddings.py deleted file mode 100644 index 7ad9101c..00000000 --- a/functions/development/bert_embeddings/latest/src/test_bert_embeddings.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed 
under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from bert_embeddings import init_context,handler -import nuclio -import json -import pickle -import numpy as np - -ARCHIVE = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -ARTIFACTS_PATH = 'artifacts' - - -def test_bert_embeddings(): - event = nuclio.Event(body=json.dumps(['John loves Mary'])) - ctx = nuclio.Context() - init_context(ctx) - outputs = pickle.loads(handler(ctx, event)) - assert (True if abs(np.mean(outputs[0]) - -0.011996539) <= 0.0001 else False) is True - assert (True if abs(np.mean(outputs[0]) - -0.011996539) > 0 else False) is True - diff --git a/functions/development/bert_embeddings/latest/static/bert_embeddings.html b/functions/development/bert_embeddings/latest/static/bert_embeddings.html deleted file mode 100644 index 863a0261..00000000 --- a/functions/development/bert_embeddings/latest/static/bert_embeddings.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - -bert_embeddings.bert_embeddings - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for bert_embeddings.bert_embeddings

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-
[docs]def init_context(context): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - model = BertModel.from_pretrained("bert-base-uncased") - model.eval() - - setattr(context.user_data, "tokenizer", tokenizer) - setattr(context.user_data, "model", model)
- - -
[docs]def handler(context, event): - docs = json.loads(event.body) - docs = [doc.lower() for doc in docs] - docs = context.user_data.tokenizer.batch_encode_plus( - docs, pad_to_max_length=True, return_tensors="pt" - ) - - with torch.no_grad(): - embeddings = context.user_data.model(**docs) - embeddings = [embeddings[0].numpy(), embeddings[1].numpy()] - return pickle.dumps(embeddings)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/static/documentation.html b/functions/development/bert_embeddings/latest/static/documentation.html deleted file mode 100644 index b99c805d..00000000 --- a/functions/development/bert_embeddings/latest/static/documentation.html +++ /dev/null @@ -1,230 +0,0 @@ - - - - - - - -bert_embeddings package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

bert_embeddings package

- -
- -
-
-
-
-
-

bert_embeddings package#

-
-

Submodules#

-
-
-

bert_embeddings.bert_embeddings module#

-
-
-bert_embeddings.bert_embeddings.handler(context, event)[source]#
-
-
-
-bert_embeddings.bert_embeddings.init_context(context)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/static/example.html b/functions/development/bert_embeddings/latest/static/example.html deleted file mode 100644 index 59984155..00000000 --- a/functions/development/bert_embeddings/latest/static/example.html +++ /dev/null @@ -1,584 +0,0 @@ - - - - - - - -BERT Embeddings Serverless Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

BERT Embeddings Serverless Function

- -
- -
-
-
-
-
-

BERT Embeddings Serverless Function#

-

This notebook presents deployment of pretrained BERT model that outputs embeddings for given textual sequences as a serverless function. Embeddings are meaningful, contextual representations of text in the form of ndarrays that are used frequently as input to various learning tasks in the field of NLP.

-
-
-

Embeddings without bert#

-

One-Hot Encoding is a general method that can vectorize any categorical features. It is simple and fast to create and update the vectorization.
-in case of text embeddings, each row is a sentence and each column is a word/char/n-gram.

-
-
-
# some sentences to do examine
-sentences = ['the quick brown fox jumps over the lazy dog',
-              'Hello I am Jacob',
-              'Daniel visited Tel-Aviv last month']
-
-
-
-
-

lets see the difference between bert embeddings and one-hot encoding

-
-
-
# constructing a list of all the words (will be our columns) - make sure no duplicate words are set
-tokens = []
-for sentence in sentences:
-    for word in sentence.split():
-        tokens.append(word) if word not in tokens else ""
-print(tokens)
-
-
-
-
-
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'Hello', 'I', 'am', 'Jacob', 'Daniel', 'visited', 'Tel-Aviv', 'last', 'month']
-
-
-
-
-
-
-
# constructing the one hot vector
-import pandas as pd
-import numpy as np
-
-one_hot = pd.DataFrame(columns = range(len(tokens)))
-# filling our empty dataframe with each sentence encoding
-for sentence in sentences:
-    vector = np.zeros(len(tokens))
-    for word in sentence.split():
-        vector[tokens.index(word)]=1
-    one_hot = one_hot.append(pd.Series(vector),ignore_index=True)
-one_hot.columns = tokens
-
-
-
-
-
-
-
one_hot
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
thequickbrownfoxjumpsoverlazydogHelloIamJacobDanielvisitedTel-Avivlastmonth
01.01.01.01.01.01.01.01.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.01.01.01.01.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.01.01.01.01.01.0
-
-
-

The table above represents the one-hot encoding of our sentences, each row is a sentence and each column is a word. -this representation is very slim and will be a very weak learning dataset.

-
-
-

Introducing Bert embeddings#

-
-
-
from mlrun import import_function, auto_mount
-
-
-
-
-
-
-
# importing the function from the hub
-fn = import_function("hub://bert_embeddings").apply(auto_mount())
-
-
-
-
-
-
-
# deploying the function
-addr = fn.deploy()
-
-
-
-
-
> 2023-02-02 09:29:59,002 [info] Starting remote function deploy
-2023-02-02 09:29:59  (info) Deploying function
-2023-02-02 09:29:59  (info) Building
-2023-02-02 09:29:59  (info) Staging files and preparing base images
-2023-02-02 09:29:59  (info) Building processor image
-2023-02-02 09:32:09  (info) Build complete
-2023-02-02 09:32:35  (info) Function deploy complete
-> 2023-02-02 09:32:36,059 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-default-bert-embeddings.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-bert-embeddings-default.default-tenant.app.cto-office.iguazio-cd1.com/']}
-
-
-
-
-
-
-
import requests
-import json
-# sending a request to the function endpoint to get the sentences' embeddings
-resp = requests.post(addr, json=json.dumps(sentences))
-
-
-
-
-
-
-
import pickle
-output_embeddings = pickle.loads(resp.content)
-
-
-
-
-
-
-
print(f'embeddings per token shape: {output_embeddings[0].shape}, pooled embeddings shape: {output_embeddings[1].shape}')
-
-
-
-
-
embeddings per token shape: (3, 11, 768), pooled embeddings shape: (3, 768)
-
-
-
-
-
-
-
pd.DataFrame(output_embeddings[1])
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0123456789...758759760761762763764765766767
0-0.733322-0.2235400.3424620.383463-0.1647960.0405220.8028450.1528420.331639-0.999779...0.2065640.2314150.1964330.7979080.4351750.7493700.2460980.427603-0.5773840.842063
1-0.953005-0.535132-0.7438220.8939340.646276-0.2793880.9435130.275504-0.555109-0.999992...0.582386-0.0046140.9760790.931517-0.3914420.5303840.675933-0.682721-0.7463390.957809
2-0.843678-0.453405-0.8260110.6508050.494036-0.1541170.8216420.349507-0.650629-0.999978...0.618286-0.3367000.9362620.857577-0.7874890.2461370.676243-0.612532-0.7087860.840879
-

3 rows × 768 columns

-
-
-

we can see that the size of the first dimension of the outputs is three since we passed in three sequences. Also the intermediate dimension of the first output is the maximal number of tokens across all input sequences. Sequences with less tokens are padded with zero values.
-Note that the first input has an intermediate dimension of size 11 that corresponds to the number of max tokens in the input sequence after addition of two special tokens marking beginning and end of a sequence by the tokenizer.
-The last dimension for both is of size 768 which is the embedding dimension for this default configuration of bert.
-Now you tell me, which encoding are you gonna use in your project ??

-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/static/function.html b/functions/development/bert_embeddings/latest/static/function.html deleted file mode 100644 index 985f9e26..00000000 --- a/functions/development/bert_embeddings/latest/static/function.html +++ /dev/null @@ -1,60 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: bert-embeddings
-  tag: ''
-  hash: 57a2ce8e0da1f6e813a8649e9ea6fcbb69a1ce5f
-  project: ''
-  labels:
-    framework: pytorch
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IGpzb24KaW1wb3J0IHBpY2tsZQoKaW1wb3J0IHRvcmNoCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBCZXJ0TW9kZWwsIEJlcnRUb2tlbml6ZXIKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgdG9rZW5pemVyID0gQmVydFRva2VuaXplci5mcm9tX3ByZXRyYWluZWQoImJlcnQtYmFzZS11bmNhc2VkIikKICAgIG1vZGVsID0gQmVydE1vZGVsLmZyb21fcHJldHJhaW5lZCgiYmVydC1iYXNlLXVuY2FzZWQiKQogICAgbW9kZWwuZXZhbCgpCgogICAgc2V0YXR0cihjb250ZXh0LnVzZXJfZGF0YSwgInRva2VuaXplciIsIHRva2VuaXplcikKICAgIHNldGF0dHIoY29udGV4dC51c2VyX2RhdGEsICJtb2RlbCIsIG1vZGVsKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGRvY3MgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICBkb2NzID0gW2RvYy5sb3dlcigpIGZvciBkb2MgaW4gZG9jc10KICAgIGRvY3MgPSBjb250ZXh0LnVzZXJfZGF0YS50b2tlbml6ZXIuYmF0Y2hfZW5jb2RlX3BsdXMoCiAgICAgICAgZG9jcywgcGFkX3RvX21heF9sZW5ndGg9VHJ1ZSwgcmV0dXJuX3RlbnNvcnM9InB0IgogICAgKQoKICAgIHdpdGggdG9yY2gubm9fZ3JhZCgpOgogICAgICAgIGVtYmVkZGluZ3MgPSBjb250ZXh0LnVzZXJfZGF0YS5tb2RlbCgqKmRvY3MpCiAgICBlbWJlZGRpbmdzID0gW2VtYmVkZGluZ3NbMF0ubnVtcHkoKSwgZW1iZWRkaW5nc1sxXS5udW1weSgpXQogICAgcmV0dXJuIHBpY2tsZS5kdW1wcyhlbWJlZGRpbmdzKQo=
-    commands: []
-    code_origin: http://github.com/aviaIguazio/functions.git#a1c9940e4c2420c88063768b4038e29b1f4e37a6:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py
-    origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/bert_embeddings/bert_embeddings.py
-    requirements:
-    - torch
-  description: Get BERT based embeddings for given text
-  default_handler: ''
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  min_replicas: 1
-  max_replicas: 4
-  source: ''
-  function_handler: bert_embeddings:handler
-  base_image_pull: false
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/static/item.html b/functions/development/bert_embeddings/latest/static/item.html deleted file mode 100644 index 612e78b6..00000000 --- a/functions/development/bert_embeddings/latest/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Get BERT based embeddings for given text
-doc: ''
-example: bert_embeddings.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  framework: pytorch
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.1
-name: bert-embeddings
-platformVersion: 3.5.3
-spec:
-  filename: bert_embeddings.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio
-  requirements:
-  - torch
-url: ''
-version: 1.2.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/bert_embeddings/latest/static/source.html b/functions/development/bert_embeddings/latest/static/source.html deleted file mode 100644 index 1df4accf..00000000 --- a/functions/development/bert_embeddings/latest/static/source.html +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import pickle
-
-import torch
-from transformers import BertModel, BertTokenizer
-
-
-def init_context(context):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-    model = BertModel.from_pretrained("bert-base-uncased")
-    model.eval()
-
-    setattr(context.user_data, "tokenizer", tokenizer)
-    setattr(context.user_data, "model", model)
-
-
-def handler(context, event):
-    docs = json.loads(event.body)
-    docs = [doc.lower() for doc in docs]
-    docs = context.user_data.tokenizer.batch_encode_plus(
-        docs, pad_to_max_length=True, return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        embeddings = context.user_data.model(**docs)
-    embeddings = [embeddings[0].numpy(), embeddings[1].numpy()]
-    return pickle.dumps(embeddings)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/catalog.json b/functions/development/catalog.json index e97e8906..9a0bbdab 100644 --- a/functions/development/catalog.json +++ b/functions/development/catalog.json @@ -1 +1 @@ -{"ingest": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "ingest", "platformVersion": "3.5.0", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "Feature Store ingest function that runs the transformation graph on the source of the featureset.", "doc": "", "example": "ingest.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "ingest", "platformVersion": "", "spec": {"filename": "ingest.py", "handler": "ingest", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/ingest.ipynb", "source": "src/ingest.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", 
"categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": 
"src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "concept_drift": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": 
["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift", "platformVersion": "", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.0.2", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", 
"image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "Deploy a streaming Concept Drift detector on a labeled stream", "doc": "", "example": "concept_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift.py", "handler": "concept_drift_deployer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-multiflow"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/concept_drift.ipynb", "source": "src/concept_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", 
"generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": 
"feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and 
Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": 
"src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], 
"description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", 
"categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf2_serving_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": 
"0.8.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving-v2", "platformVersion": "", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"tf2-serving-v2", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server v2", "doc": "", "example": "tf2_serving_v2.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving-v2", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving_v2.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving_v2.ipynb", "source": "src/tf2_serving_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": 
"2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict 
functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": 
"mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", 
"categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", 
"platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", 
"machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": 
"src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "sql_to_file": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "sql-to-file", "platformVersion": "", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "sql-to-file", "platformVersion": "3.5.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "SQL To File - Ingest data using SQL query", "doc": "", "example": "sql_to_file.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "adih"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "sql-to-file", "platformVersion": "3.2.0", "spec": {"filename": "sql_to_file.py", "handler": "sql_to_file", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/sql_to_file.ipynb", "source": "src/sql_to_file.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "concept_drift_streaming": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "concept-drift-streaming", "platformVersion": "", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.0.2", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "concept-drift-streaming", "platformVersion": "3.5.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "monitoring"], "description": "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function", "doc": "", "example": "concept_drift_streaming.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "concept-drift-streaming", "platformVersion": "3.2.0", "spec": {"filename": "concept_drift_streaming.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["scikit-multiflow==0.4.1", "v3io_frames"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/concept_drift_streaming.ipynb", "source": "src/concept_drift_streaming.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "get_offline_features": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", 
"categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-05-25:10-58", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "get_offline_features", "platformVersion": "3.5.0", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "data-analysis", "feature-store"], "description": "retrieve offline feature vector results", "doc": "", "example": "get_offline_features.ipynb", "generationDate": "2022-01-17:17-56", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "get_offline_features", "platformVersion": "", "spec": {"filename": "get_offline_features.py", "handler": "get_offline_features", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/get_offline_features.ipynb", "source": "src/get_offline_features.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_perms": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": 
["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "feature-perms", "platformVersion": "", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-perms", "platformVersion": "3.5.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/feature_perms.ipynb", "source": "src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "estimate feature importances using permutations", "doc": "", "example": "feature_perms.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-perms", "platformVersion": "3.2.0", "spec": {"filename": "feature_perms.py", "handler": "permutation_importance", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_perms.ipynb", "source": 
"src/feature_perms.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX 
framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "", "name": "onnx_utils", "platformVersion": "", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", 
"assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": 
false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML 
platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], 
"description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server_tester": {"latest": {"apiVersion": "v1", 
"categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": 
"src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", 
"handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", 
"labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary 
classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": 
"src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", 
"kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "slack_notify": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "slack-notify", 
"platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "slack-notify", "platformVersion": "", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"slack-notify", "platformVersion": "3.5.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Slack notification", "doc": "", "example": "slack_notify.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "mdl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "slack-notify", "platformVersion": "3.2.0", "spec": {"filename": "slack_notify.py", "handler": "slack_notify", "image": "python:3.6-jessie", "kind": "job", "requirements": ["requests"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/slack_notify.ipynb", "source": "src/slack_notify.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", 
"name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", 
"name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic 
sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": 
{"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "virtual_drift": {"latest": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": 
"virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "virtual-drift", "platformVersion": "", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/virtual_drift.ipynb", 
"source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "virtual-drift", "platformVersion": "3.5.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis", "machine-learning"], "description": "Compute drift magnitude between Time-Samples T and U", "doc": "", "example": "virtual_drift.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "virtual-drift", "platformVersion": "3.2.0", "spec": {"filename": "virtual_drift.py", "handler": "drift_magnitude", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "scipy", "v3io_frames"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/virtual_drift.ipynb", "source": "src/virtual_drift.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", 
"platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", 
"platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", 
"hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save 
as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, 
"0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/open_archive.ipynb", "source": 
"src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": 
"src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": 
"src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf1_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf1-serving", "platformVersion": "", "spec": {"filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": [], "env": {"MODEL_CLASS": "TFModel", "ENABLE_EXPLAINER": false}}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf1-serving", "platformVersion": "3.5.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", 
"categories": ["model-serving", "machine-learning"], "description": "tf1 image classification server", "doc": "", "example": "tf1_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf1-serving", "platformVersion": "3.2.0", "spec": {"env": {"ENABLE_EXPLAINER": false, "MODEL_CLASS": "TFModel"}, "filename": "tf1_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf1_serving.ipynb", "source": "src/tf1_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "stream_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", 
"spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "stream-to-parquet", "platformVersion": "", "spec": {"filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": [], "customFields": {"min_replicas": 1, "max_replicas": 1}}, "url": "", "version": "0.0.1", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "stream-to-parquet", "platformVersion": "3.5.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Saves a stream to Parquet and can lunch drift detection task on it", "doc": "", "example": "stream_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "stream-to-parquet", "platformVersion": "3.2.0", "spec": {"customFields": {"max_replicas": 1, "min_replicas": 1}, "filename": "stream_to_parquet.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/stream_to_parquet.ipynb", "source": "src/stream_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "rnn_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": 
"rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.8.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.0.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "rnn-serving", "platformVersion": "", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["keras"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "rnn-serving", "platformVersion": "3.5.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "1.1.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an rnn based stock analysis model server.", "doc": "", "example": "rnn_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "rnn-serving", "platformVersion": "3.2.0", "spec": {"filename": "rnn_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": null}, "url": "", "version": "0.9.0", "assets": {"example": "src/rnn_serving.ipynb", "source": "src/rnn_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", 
"doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": 
{"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "xgb_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.8.0", 
"assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.2": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "xgb_serving", "platformVersion": "3.5.3", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.2", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "xgb_serving", "platformVersion": "3.0.0", "spec": {"filename": 
"xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "xgb_serving", "platformVersion": "3.5.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "deploy an XGBoost model server.", "doc": "", "example": "xgb_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "xgb_serving", "platformVersion": "3.2.0", "spec": {"filename": "xgb_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "remote", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/xgb_serving.ipynb", "source": "src/xgb_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "bert_embeddings": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "bert-embeddings", "platformVersion": "3.5.3", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": ["torch"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", 
"data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "bert-embeddings", "platformVersion": "2.10.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "bert-embeddings", "platformVersion": "3.2.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": 
"0.9.0", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "Get BERT based embeddings for given text", "doc": "", "example": "bert_embeddings.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"framework": "pytorch"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "bert-embeddings", "platformVersion": "3.5.0", "spec": {"filename": "bert_embeddings.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "nuclio", "requirements": ["torch==1.6.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/bert_embeddings.ipynb", "source": "src/bert_embeddings.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", 
"icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", 
"doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe": {"latest": {"apiVersion": "v1", 
"categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", 
"categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": 
["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pandas_profiling_report": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", 
"function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "pandas-profiling-report", "platformVersion": "", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "pandas-profiling-report", "platformVersion": "3.5.0", 
"spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "Create Pandas Profiling Report from Dataset", "doc": "", "example": "pandas_profiling_report.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "nicks"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "pandas-profiling-report", "platformVersion": "3.2.0", "spec": {"filename": "pandas_profiling_report.py", "handler": "pandas_profiling_report", "image": "mlrun/mlrun", "kind": "job", "requirements": ["pandas_profiling"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/pandas_profiling_report.ipynb", "source": "src/pandas_profiling_report.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_stream": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": 
"model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-stream", "platformVersion": "", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": 
["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-stream", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_stream.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-stream", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_stream.py", "handler": "handler", "image": "livsmichael/mlrun-api:automation", "kind": "nuclio", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_monitoring_stream.ipynb", "source": "src/model_monitoring_stream.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "snowflake_dask": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": 
{"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "snowflake_dask", "platformVersion": "3.5.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Snowflake Dask - Ingest snowflake data in parallel with Dask cluster", "doc": "", "example": "snowflake-dask-mlrun.ipynb", "generationDate": "2022-03-20:12-28", "icon": "", "labels": {"author": "xingsheng", "framework": "dask"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.1", "name": "snowflake_dask", "platformVersion": "3.2.0", "spec": {"filename": "snowflake_dask.py", "handler": "load_results", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/snowflake-dask-mlrun.ipynb", "source": "src/snowflake_dask.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": 
"mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the 
common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": 
"yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_classifier_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", 
"handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", "onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.1.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.5", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.14.1", 
"onnxruntime~=1.16.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.2.0", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train and optimize functions for HuggingFace framework", "doc": "", "example": "hugging_face_classifier_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "hugging_face_classifier_trainer", "platformVersion": "3.5.0", "spec": {"filename": "hugging_face_classifier_trainer.py", "handler": "train", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "optimum~=1.6.4", "transformers~=4.26.1", "datasets~=2.10.1", "scikit-learn~=1.0.2"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/hugging_face_classifier_trainer.ipynb", "source": "src/hugging_face_classifier_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", 
"assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": 
"transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/transcribe.ipynb", 
"source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "huggingface_auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "fine-tune llm model with ease", "doc": "", "example": "huggingface_auto_trainer.ipynb", "generationDate": "2023-08-21:17-25", "hidden": false, "icon": "", "labels": {"author": "Zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "huggingface-auto-trainer", "platformVersion": "3.5.0", "spec": {"filename": "huggingface_auto_trainer.py", "handler": "finetune_llm", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/huggingface_auto_trainer.ipynb", "source": "src/huggingface_auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This 
function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.0.0": {"apiVersion": "v1", 
"categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as 
prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data 
drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": 
"2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true, "assets": {"example": 
"src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data 
according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": 
"structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": 
"src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio 
file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", 
"kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": 
"pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": 
"", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}} \ No newline at end of file +{"load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", 
"doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", 
"function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", 
"kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_selection.ipynb", "source": 
"src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": 
["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", 
"icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", 
"categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", 
"spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict 
functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": 
"src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "churn_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "churn-server", "platformVersion": "", "spec": {"filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": [], "env": {"ENABLE_EXPLAINER": "False"}, "customFields": {"default_class": "ChurnModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "churn-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "churn classification and predictor", "doc": "", "example": "churn_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "churn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "churn-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ChurnModel"}, "env": {"ENABLE_EXPLAINER": "False"}, "filename": "churn_server.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": 
["xgboost==1.3.1", "lifelines==0.22.8"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/churn_server.ipynb", "source": "src/churn_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", 
"example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "onnx_utils", "platformVersion": "", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", 
"requirements": ["onnx~=1.13.0", "onnxruntime~=1.14.0", "onnxoptimizer~=0.3.0", "onnxmltools~=1.11.0", "tf2onnx~=1.13.0"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": ["onnx~=1.10.1", "onnxruntime~=1.8.1", "onnxoptimizer~=0.2.0", "onnxmltools~=1.9.0", "tf2onnx~=1.9.0"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-10-25:00-15", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.1": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.1", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "onnx_utils", "platformVersion": "3.2.0", "spec": {"filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": 
["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", 
"version": "0.9.3", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": 
"azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", 
"extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", 
"name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model 
servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", 
"docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", 
"name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "coxph_test": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "coxph-test", "platformVersion": "", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "coxph-test", "platformVersion": "3.5.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/coxph_test.ipynb", "source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-testing"], "description": "Test cox proportional hazards model", "doc": "", "example": "coxph_test.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio", "framework": "survival"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "coxph-test", "platformVersion": "3.2.0", "spec": {"filename": "coxph_test.py", "handler": "cox_test", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/coxph_test.ipynb", 
"source": "src/coxph_test.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": 
{"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": 
{"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", 
"framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_server.ipynb", "source": 
"src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", 
"kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": 
"mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "arc-to-parquet", "platformVersion": "2.10.0", "spec": 
{"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "arc-to-parquet", "platformVersion": "3.5.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-05-19:22-04", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "arc-to-parquet", "platformVersion": "3.2.0", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/ml-base", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": 
"open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "open-archive", "platformVersion": "", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": 
["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "open-archive", "platformVersion": "3.2.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_monitoring_batch": {"latest": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-monitoring-batch", "platformVersion": "", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-monitoring-batch", "platformVersion": "3.5.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring"], "description": "", "doc": "", "example": "model_monitoring_batch.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-monitoring-batch", "platformVersion": "3.2.0", "spec": {"filename": "model_monitoring_batch.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_monitoring_batch.ipynb", "source": "src/model_monitoring_batch.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": 
"src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", 
"image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": 
"Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, 
"describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": 
"1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows 
as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", 
"platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": 
["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "validate_great_expectations": {"latest": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, 
"1.1.0": {"apiVersion": "v1", "categories": ["data-validation", "data-analysis"], "description": "Validate a dataset using Great Expectations", "doc": "", "example": "validate_great_expectations.ipynb", "generationDate": "2022-04-26:12-28", "hidden": false, "icon": "", "labels": {"author": "nicks", "framework": "great-expectations"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "validate-great-expectations", "platformVersion": "3.5.2", "spec": {"filename": "validate_great_expectations.py", "handler": "validate_expectations", "image": "mlrun/mlrun", "kind": "job", "requirements": ["great-expectations==0.15.41"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/validate_great_expectations.ipynb", "source": "src/validate_great_expectations.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", 
"name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": 
["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", 
"image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": 
{"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", 
"https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": 
"mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": 
{"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.0.0": {"apiVersion": "v1", 
"categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as 
prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": 
"translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": 
"src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating 
structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "GenAI"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": 
"structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true, "assets": {"example": 
"src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "PyTorch", "Audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": 
"2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", 
"source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "Huggingface", "Audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], 
"mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}} \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.1/src/README.md b/functions/development/concept_drift/0.0.1/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/0.0.1/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. 
there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. - Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. 
-:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$mu=np_t$](https://latex.codecogs.com/svg.latex?mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$]() - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- works better for **gradual** drift changes. -- More sensitive then DDM for noise -- Requires Minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}) - -![$\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$]() - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebasti˜ao1,2 and Jo˜ao Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/0.0.1/src/concept_drift.ipynb b/functions/development/concept_drift/0.0.1/src/concept_drift.ipynb deleted file mode 100644 index 0fbd3673..00000000 --- a/functions/development/concept_drift/0.0.1/src/concept_drift.ipynb +++ /dev/null @@ -1,383 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "\n", - "This function is the Deployment step for the Streaming Concept Drift Detector. It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming serverless Nuclio function](../concept_drift_streaming/concept_drift_streaming.ipynb) with them for streaming concept-drift detection on top of a labeled stream." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Environment setup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames\n", - "python -m pip install nuclio-jupyter" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "from cloudpickle import dumps, load, dump\n", - "\n", - "from nuclio.triggers import V3IOStreamTrigger \n", - "from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def concept_drift_deployer(context: MLClientCtx, base_dataset:DataItem, \n", - " input_stream:str, output_stream:str, output_tsdb:str, tsdb_batch_size:int, callbacks:list, \n", - " models:list=['ddm', 'eddm', 'pagehinkley'], 
models_dest='models',\n", - " pagehinkley_threshold:float=10, ddm_warning_level:float=2, ddm_out_control_level:float=3,\n", - " label_col='label', prediction_col='prediction', hub_url:str=mlconf.hub_url, fn_tag:str='master'):\n", - " \"\"\"Deploy a streaming Concept Drift detector on a labeled stream\n", - " This function is the Deployment step for the Streaming Concept Drift Detector.\n", - " It will load the selected drift detectors and initialize them with the \n", - " base_dataset's statistics. Then it will deploy the concept_drift_streaming \n", - " function and pass the models to it for streaming concept-drift detection on top\n", - " of a labeled stream. \n", - "\n", - " :param context: MLRun context\n", - " :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors\n", - " :param input_stream: labeled stream to track.\n", - " Should contain label_col and prediction_col\n", - " :param output_stream: Output stream to push the detector's alerts\n", - " :param output_tsdb: Output TSDB table to allow analysis and display\n", - " :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB\n", - " :param callbacks: Additional rest endpoints to send the alert data to\n", - " :param models: List of the detectors to deploy\n", - " Defaults to ['ddm', 'eddm', 'pagehinkley'].\n", - " :param models_dest: Location for saving the detectors\n", - " Defaults to 'models' (in relation to artifact_path).\n", - " :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10.\n", - " :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2.\n", - " :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3.\n", - " :param label_col: Label column to be used on base_dataset and input_stream\n", - " Defaults to 'label'.\n", - " :param prediction_col: Prediction column to be used on base_dataset and input_stream\n", - " Defaults to 'prediction'.\n", - " :param 
hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded\n", - " by this url\n", - " Defaults to mlconf.hub_url.\n", - " :param fn_tag: hub tag to use\n", - " Defaults to 'master'\n", - " \"\"\"\n", - "\n", - " # Set the streaming function\n", - " mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - " mlconf.hub_url = hub_url\n", - " fn = import_function(url='hub://concept_drift_streaming')\n", - " \n", - " # Load test dataset\n", - " context.logger.info('Loading base dataset')\n", - " base_df = base_dataset.as_df()\n", - " error_stream = np.where(base_df[prediction_col].values==base_df[label_col].values, 0, 1)\n", - " \n", - " # Create models\n", - " context.logger.info('Creating models')\n", - " models = [model.strip() for model in os.getenv('models', 'pagehinkley, ddm, eddm').split(',')]\n", - " models = {'eddm': skmultiflow.drift_detection.EDDM(),\n", - " 'pagehinkley': skmultiflow.drift_detection.PageHinkley(min_instances=len(error_stream),\n", - " threshold=pagehinkley_threshold),\n", - " 'ddm': skmultiflow.drift_detection.DDM(min_num_instances=len(error_stream),\n", - " warning_level=ddm_warning_level,\n", - " out_control_level=ddm_out_control_level)}\n", - " \n", - " # Initialzie the models with the base dataset\n", - " context.logger.info('Streaming data to models')\n", - " for i in range(len(error_stream)):\n", - " for model_name, model in models.items():\n", - " model.add_element(error_stream[i])\n", - " \n", - " # Save warm models\n", - " context.logger.info('Logging ready models')\n", - " for name, model in models.items():\n", - " data = dumps(model)\n", - " model_file = f'{name}.pkl'\n", - " context.log_model(f'{name}_concept_drift', body=data, labels={'framework': 'skmultiflow', 'workflow': 'concept-drift'},\n", - " model_file=model_file, model_dir=models_dest, tag='latest')\n", - " fn.set_envs({f'{name}_model_path': os.path.join(context.artifact_path, models_dest, model_file)})\n", - " \n", - " # Deploy 
streaming concept drift function\n", - " # with the warm models\n", - " context.logger.info('Deploying Concept Drift Streaming function')\n", - " fn.set_envs({'label_col': label_col,\n", - " 'prediction_col': prediction_col, \n", - " 'drift_stream': output_stream,\n", - " 'tsdb_table': output_tsdb,\n", - " 'pagehinkley_threshold': pagehinkley_threshold,\n", - " 'ddm_warning_level': ddm_warning_level,\n", - " 'ddm_out_control': ddm_out_control_level}) \n", - " fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=input_stream, name='labeled_stream'))\n", - " fn.apply(mount_v3io())\n", - " fn.deploy(project=context.project)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Local test\n", - "A usecase based run example" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import run_local, NewTask" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "container = 'bigdata'\n", - "base_table = os.path.join('/', container, 'network-operations')\n", - "stream_consumer_group = 'cd'\n", - "artifacts_path = os.path.join(os.getcwd(), 'artifacts')\n", - "\n", - "task = NewTask(name='concept_drift_deployer',\n", - " project='network-operations',\n", - " handler=concept_drift_deployer,\n", - " params={'models': ['ddm', 'eddm', 'pagehinkley'],\n", - " 'label_col': 'is_error',\n", - " 'prediction_col': 'yscore',\n", - " 'output_tsdb': os.path.join(base_table, 'drift_tsdb'),\n", - " 'input_stream': f'http://{os.environ[\"V3IO_API\"]}{os.path.join(base_table, 'inference_stream')}@{stream_consumer_group}',\n", - " 'output_stream': os.path.join(base_table, 'drift_stream')},\n", - " inputs={'base_dataset': 'store://network-operations/test_test_set_preds'},\n", - " artifact_path=artifacts_path)" - 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_local(task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-12-23 12:39:26,663 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift\", \n", - " kind='job',\n", - " with_doc=True,\n", - " embed_code=True)\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"concept_drift_deployer\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.run(task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/0.0.1/src/concept_drift.py b/functions/development/concept_drift/0.0.1/src/concept_drift.py deleted file mode 100644 index 2b47ec2b..00000000 --- a/functions/development/concept_drift/0.0.1/src/concept_drift.py +++ /dev/null @@ -1,134 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This function is the Deployment step for the Streaming Concept Drift Detector. 
- It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. 
- :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url="hub://concept_drift_streaming") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_trigger( - "labeled_stream", V3IOStreamTrigger(url=input_stream, name="labeled_stream") - ) - fn.apply(mount_v3io()) 
- fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/0.0.1/src/function.yaml b/functions/development/concept_drift/0.0.1/src/function.yaml deleted file mode 100644 index ead217d7..00000000 --- a/functions/development/concept_drift/0.0.1/src/function.yaml +++ /dev/null @@ -1,106 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 9b6fd888a47f7ce60c429a0b3fa11902d213edb0 - project: default - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. 
Should contain label_col and prediction_col - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f4521c584e0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBvdXRwdXRfc3RyZWFtOiBzdHIsCiAgICBvdXRwdXRfdHNkYjogc3RyLAogICAgdHNkYl9iYXRjaF9zaXplOiBpbnQsCiAgICBjYWxsYmFja3M6IGxpc3QsCiAgICBtb2RlbHM6IGxpc3QgPSBbImRkbSIsICJlZGRtIiwgInBhZ2VoaW5rbGV5Il0sCiAgICBtb2RlbHNfZGVzdD0ibW9kZWxzIiwKICAgIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogZmxvYXQgPSAxMCwKICAgIGRkbV93YXJuaW5nX2xldmVsOiBmbG9hdCA9IDIsCiAgICBkZG1fb3V0X2NvbnRyb2xfbGV2ZWw6IGZsb2F0ID0gMywKICAgIGxhYmVsX2NvbD0ibGFiZWwiLAogICAgcHJlZGljdGlvbl9jb2w9InByZWRpY3Rpb24iLAogICAgaHViX3VybDogc3RyID0gbWxjb25mLmh1Yl91cmwsCiAgICBmbl90YWc6IHN0ciA9ICJtYXN0ZXIiLAopOgogICAgIiIiRGVwbG95IGEgc3RyZWFtaW5nIENvbmNlcHQgRHJpZnQgZGV0ZWN0b3Igb24gYSBsYWJlbGVkIHN0cmVhbQogICAgICAgVGhpcyBmdW5jdGlvbiBpcyB0aGUgRGVwbG95bWVudCBzdGVwIGZvciB0aGUgU3RyZWFtaW5nIENvbmNlcHQgRHJpZnQgRGV0ZWN0b3IuCiAgICAgICBJdCB3aWxsIGxvYWQgdGhlIHNlbGVjdGVkIGRyaWZ0IGRldGVjdG9ycyBhbmQgaW5pdGlhbGl6ZSB0aGVtIHdpdGggdGhlCiAgICAgICBiYXNlX2RhdGFzZXQncyBzdGF0aXN0aWNzLiAgVGhlbiBpdCB3aWxsIGRlcGxveSB0aGUgY29uY2VwdF9kcmlmdF9zdHJlYW1pbmcKICAgICAgIGZ1bmN0aW9uIGFuZCBwYXNzIHRoZSBtb2RlbHMgdG8gaXQgZm9yIHN0cmVhbWluZyBjb25jZXB0LWRyaWZ0IGRldGVjdGlvbiBvbiB0b3AKICAgICAgIG
9mIGEgbGFiZWxlZCBzdHJlYW0uCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgTUxSdW4gY29udGV4dAogICAgOnBhcmFtIGJhc2VfZGF0YXNldDogICAgRGF0YXNldCBjb250YWluaW5nIGxhYmVsX2NvbCBhbmQgcHJlZGljdGlvbl9jb2wgdG8gaW5pdGlhbGl6ZSB0aGUgZGV0ZWN0b3JzCiAgICA6cGFyYW0gaW5wdXRfc3RyZWFtOiAgICBsYWJlbGVkIHN0cmVhbSB0byB0cmFjay4KICAgICAgICAgICAgICAgICAgICAgICAgICAgIFNob3VsZCBjb250YWluIGxhYmVsX2NvbCBhbmQgcHJlZGljdGlvbl9jb2wKICAgIDpwYXJhbSBvdXRwdXRfc3RyZWFtOiAgIE91dHB1dCBzdHJlYW0gdG8gcHVzaCB0aGUgZGV0ZWN0b3IncyBhbGVydHMKICAgIDpwYXJhbSBvdXRwdXRfdHNkYjogICAgIE91dHB1dCBUU0RCIHRhYmxlIHRvIGFsbG93IGFuYWx5c2lzIGFuZCBkaXNwbGF5CiAgICA6cGFyYW0gdHNkYl9iYXRjaF9zaXplOiBCYXRjaCBzaXplIG9mIGFsZXJ0cyB0byBidWZmZXIgYmVmb3JlIHB1c2hpbmcgdG8gdGhlIFRTREIKICAgIDpwYXJhbSBjYWxsYmFja3M6ICAgICAgIEFkZGl0aW9uYWwgcmVzdCBlbmRwb2ludHMgdG8gc2VuZCB0aGUgYWxlcnQgZGF0YSB0bwogICAgOnBhcmFtIG1vZGVsczogICAgICAgICAgTGlzdCBvZiB0aGUgZGV0ZWN0b3JzIHRvIGRlcGxveQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gWydkZG0nLCAnZWRkbScsICdwYWdlaGlua2xleSddLgogICAgOnBhcmFtIG1vZGVsc19kZXN0OiAgICAgTG9jYXRpb24gZm9yIHNhdmluZyB0aGUgZGV0ZWN0b3JzCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAnbW9kZWxzJyAoaW4gcmVsYXRpb24gdG8gYXJ0aWZhY3RfcGF0aCkuCiAgICA6cGFyYW0gcGFnZWhpbmtsZXlfdGhyZXNob2xkOiAgRHJpZnQgbGV2ZWwgdGhyZXNob2xkIGZvciBQSCBkZXRlY3RvciBEZWZhdWx0cyB0byAxMC4KICAgIDpwYXJhbSBkZG1fd2FybmluZ19sZXZlbDogICAgICBXYXJuaW5nIGxldmVsIGFsZXJ0IGZvciBERE0gZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMi4KICAgIDpwYXJhbSBkZG1fb3V0X2NvbnRyb2xfbGV2ZWw6ICBEcmlmdCBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDMuCiAgICA6cGFyYW0gbGFiZWxfY29sOiAgICAgICBMYWJlbCBjb2x1bW4gdG8gYmUgdXNlZCBvbiBiYXNlX2RhdGFzZXQgYW5kIGlucHV0X3N0cmVhbQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ2xhYmVsJy4KICAgIDpwYXJhbSBwcmVkaWN0aW9uX2NvbDogIFByZWRpY3Rpb24gY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdwcmVkaWN0aW9uJy4KICAgIDpwYXJhbSBodWJfdXJsOiAgICAgICAgIGh1Yl91cmwgaW4gY2FzZSB0aGUgZGVmYXVsdCBpcyBub3QgdXNlZCwgY29uY2VwdF9kcm
lmdF9zdHJlYW1pbmcgd2lsbCBiZSBsb2FkZWQKICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJ5IHRoaXMgdXJsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byBtbGNvbmYuaHViX3VybC4KICAgIDpwYXJhbSBmbl90YWc6ICAgICAgICAgIGh1YiB0YWcgdG8gdXNlCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAnbWFzdGVyJwogICAgIiIiCgogICAgbWxjb25mLmRicGF0aCA9IG1sY29uZi5kYnBhdGggb3IgImh0dHA6Ly9tbHJ1bi1hcGk6ODA4MCIKICAgIG1sY29uZi5odWJfdXJsID0gaHViX3VybAogICAgZm4gPSBpbXBvcnRfZnVuY3Rpb24odXJsPSJodWI6Ly9jb25jZXB0X2RyaWZ0X3N0cmVhbWluZyIpCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiTG9hZGluZyBiYXNlIGRhdGFzZXQiKQogICAgYmFzZV9kZiA9IGJhc2VfZGF0YXNldC5hc19kZigpCiAgICBlcnJvcl9zdHJlYW0gPSBucC53aGVyZSgKICAgICAgICBiYXNlX2RmW3ByZWRpY3Rpb25fY29sXS52YWx1ZXMgPT0gYmFzZV9kZltsYWJlbF9jb2xdLnZhbHVlcywgMCwgMQogICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNyZWF0aW5nIG1vZGVscyIpCiAgICBtb2RlbHMgPSBbCiAgICAgICAgbW9kZWwuc3RyaXAoKQogICAgICAgIGZvciBtb2RlbCBpbiBvcy5nZXRlbnYoIm1vZGVscyIsICJwYWdlaGlua2xleSwgZGRtLCBlZGRtIikuc3BsaXQoIiwiKQogICAgXQogICAgbW9kZWxzID0gewogICAgICAgICJlZGRtIjogc2ttdWx0aWZsb3cuZHJpZnRfZGV0ZWN0aW9uLkVERE0oKSwKICAgICAgICAicGFnZWhpbmtsZXkiOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uUGFnZUhpbmtsZXkoCiAgICAgICAgICAgIG1pbl9pbnN0YW5jZXM9bGVuKGVycm9yX3N0cmVhbSksIHRocmVzaG9sZD1wYWdlaGlua2xleV90aHJlc2hvbGQKICAgICAgICApLAogICAgICAgICJkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRERNKAogICAgICAgICAgICBtaW5fbnVtX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwKICAgICAgICAgICAgd2FybmluZ19sZXZlbD1kZG1fd2FybmluZ19sZXZlbCwKICAgICAgICAgICAgb3V0X2NvbnRyb2xfbGV2ZWw9ZGRtX291dF9jb250cm9sX2xldmVsLAogICAgICAgICksCiAgICB9CgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiU3RyZWFtaW5nIGRhdGEgdG8gbW9kZWxzIikKICAgIGZvciBpIGluIHJhbmdlKGxlbihlcnJvcl9zdHJlYW0pKToKICAgICAgICBmb3IgbW9kZWxfbmFtZSwgbW9kZWwgaW4gbW9kZWxzLml0ZW1zKCk6CiAgICAgICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGVycm9yX3N0cmVhbVtpXSkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2dnaW5nIHJlYWR5IG1vZGVscyIpCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gbW9kZWxzLml0ZW1zKCk6CiAgICAgICAgZGF0YSA9IGR1bXBzKG1vZGVsKQogICAgICAgIG1vZGVsX2ZpbGUgPSBmIntuYW1lfS
5wa2wiCiAgICAgICAgY29udGV4dC5sb2dfbW9kZWwoCiAgICAgICAgICAgIGYie25hbWV9X2NvbmNlcHRfZHJpZnQiLAogICAgICAgICAgICBib2R5PWRhdGEsCiAgICAgICAgICAgIGxhYmVscz17ImZyYW1ld29yayI6ICJza211bHRpZmxvdyIsICJ3b3JrZmxvdyI6ICJjb25jZXB0LWRyaWZ0In0sCiAgICAgICAgICAgIG1vZGVsX2ZpbGU9bW9kZWxfZmlsZSwKICAgICAgICAgICAgbW9kZWxfZGlyPW1vZGVsc19kZXN0LAogICAgICAgICAgICB0YWc9ImxhdGVzdCIsCiAgICAgICAgKQogICAgICAgIGZuLnNldF9lbnZzKAogICAgICAgICAgICB7CiAgICAgICAgICAgICAgICBmIntuYW1lfV9tb2RlbF9wYXRoIjogb3MucGF0aC5qb2luKAogICAgICAgICAgICAgICAgICAgIGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgbW9kZWxzX2Rlc3QsIG1vZGVsX2ZpbGUKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgfQogICAgICAgICkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJEZXBsb3lpbmcgQ29uY2VwdCBEcmlmdCBTdHJlYW1pbmcgZnVuY3Rpb24iKQogICAgZm4uc2V0X2VudnMoCiAgICAgICAgewogICAgICAgICAgICAibGFiZWxfY29sIjogbGFiZWxfY29sLAogICAgICAgICAgICAicHJlZGljdGlvbl9jb2wiOiBwcmVkaWN0aW9uX2NvbCwKICAgICAgICAgICAgImRyaWZ0X3N0cmVhbSI6IG91dHB1dF9zdHJlYW0sCiAgICAgICAgICAgICJ0c2RiX3RhYmxlIjogb3V0cHV0X3RzZGIsCiAgICAgICAgICAgICJwYWdlaGlua2xleV90aHJlc2hvbGQiOiBwYWdlaGlua2xleV90aHJlc2hvbGQsCiAgICAgICAgICAgICJkZG1fd2FybmluZ19sZXZlbCI6IGRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICAiZGRtX291dF9jb250cm9sIjogZGRtX291dF9jb250cm9sX2xldmVsLAogICAgICAgIH0KICAgICkKICAgIGZuLmFkZF90cmlnZ2VyKAogICAgICAgICJsYWJlbGVkX3N0cmVhbSIsIFYzSU9TdHJlYW1UcmlnZ2VyKHVybD1pbnB1dF9zdHJlYW0sIG5hbWU9ImxhYmVsZWRfc3RyZWFtIikKICAgICkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/concept_drift/concept_drift.py - affinity: null -verbose: false diff --git a/functions/development/concept_drift/0.0.1/src/item.yaml b/functions/development/concept_drift/0.0.1/src/item.yaml deleted file mode 100644 index 797f994d..00000000 --- a/functions/development/concept_drift/0.0.1/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving 
-description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2021-05-19:22-04 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: concept-drift -platformVersion: '' -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 0.0.1 diff --git a/functions/development/concept_drift/0.0.1/static/documentation.html b/functions/development/concept_drift/0.0.1/static/documentation.html deleted file mode 100644 index 37daa510..00000000 --- a/functions/development/concept_drift/0.0.1/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift package

-
-

Submodules

-
-
-

concept_drift.concept_drift module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.1/static/example.html b/functions/development/concept_drift/0.0.1/static/example.html deleted file mode 100644 index 35e54ea2..00000000 --- a/functions/development/concept_drift/0.0.1/static/example.html +++ /dev/null @@ -1,393 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Concept Drift - Deployer

-

Deploy a streaming Concept Drift detector on a labeled stream.

-

This function is the Deployment step for the Streaming Concept Drift Detector. It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming serverless Nuclio function with them for streaming concept-drift detection on top of a labeled stream.

-
-

Environment setup

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-python -m pip install nuclio-jupyter
-
-
-
-
-
-
-
# Define function spec
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger 
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-# For testing
-import random
-
-
-
-
-
-
-
def concept_drift_deployer(context: MLClientCtx, base_dataset:DataItem, 
-                           input_stream:str, output_stream:str, output_tsdb:str, tsdb_batch_size:int, callbacks:list, 
-                           models:list=['ddm', 'eddm', 'pagehinkley'], models_dest='models',
-                           pagehinkley_threshold:float=10, ddm_warning_level:float=2, ddm_out_control_level:float=3,
-                           label_col='label', prediction_col='prediction', hub_url:str=mlconf.hub_url, fn_tag:str='master'):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the 
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming 
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream. 
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    # Set the streaming function
-    mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-    mlconf.hub_url = hub_url
-    fn = import_function(url='hub://concept_drift_streaming')
-    
-    # Load test dataset
-    context.logger.info('Loading base dataset')
-    base_df = base_dataset.as_df()
-    error_stream = np.where(base_df[prediction_col].values==base_df[label_col].values, 0, 1)
-    
-    # Create models
-    context.logger.info('Creating models')
-    models = [model.strip() for model in os.getenv('models', 'pagehinkley, ddm, eddm').split(',')]
-    models = {'eddm': skmultiflow.drift_detection.EDDM(),
-              'pagehinkley': skmultiflow.drift_detection.PageHinkley(min_instances=len(error_stream),
-                                                                     threshold=pagehinkley_threshold),
-              'ddm': skmultiflow.drift_detection.DDM(min_num_instances=len(error_stream),
-                                                     warning_level=ddm_warning_level,
-                                                     out_control_level=ddm_out_control_level)}
-    
-    # Initialzie the models with the base dataset
-    context.logger.info('Streaming data to models')
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-            
-    # Save warm models
-    context.logger.info('Logging ready models')
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f'{name}.pkl'
-        context.log_model(f'{name}_concept_drift', body=data, labels={'framework': 'skmultiflow', 'workflow': 'concept-drift'},
-                          model_file=model_file, model_dir=models_dest, tag='latest')
-        fn.set_envs({f'{name}_model_path': os.path.join(context.artifact_path, models_dest, model_file)})
-            
-    # Deploy streaming concept drift function
-    # with the warm models
-    context.logger.info('Deploying Concept Drift Streaming function')
-    fn.set_envs({'label_col': label_col,
-                 'prediction_col': prediction_col, 
-                 'drift_stream': output_stream,
-                 'tsdb_table': output_tsdb,
-                 'pagehinkley_threshold': pagehinkley_threshold,
-                 'ddm_warning_level': ddm_warning_level,
-                 'ddm_out_control': ddm_out_control_level})    
-    fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=input_stream, name='labeled_stream'))
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

Local test

-

A usecase based run example

-
-
-
from mlrun import run_local, NewTask
-
-
-
-
-
-
-
container = 'bigdata'
-base_table = os.path.join('/', container, 'network-operations')
-stream_consumer_group = 'cd'
-artifacts_path = os.path.join(os.getcwd(), 'artifacts')
-
-task = NewTask(name='concept_drift_deployer',
-        project='network-operations',
-        handler=concept_drift_deployer,
-        params={'models': ['ddm', 'eddm', 'pagehinkley'],
-                'label_col': 'is_error',
-                'prediction_col': 'yscore',
-                'output_tsdb': os.path.join(base_table, 'drift_tsdb'),
-                'input_stream': f'http://{os.environ["V3IO_API"]}{os.path.join(base_table, 'inference_stream')}@{stream_consumer_group}',
-                'output_stream': os.path.join(base_table, 'drift_stream')},
-        inputs={'base_dataset': 'store://network-operations/test_test_set_preds'},
-        artifact_path=artifacts_path)
-
-
-
-
-
-
-
run_local(task)
-
-
-
-
-
-
-

Save function yaml

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift", 
-                      kind='job',
-                      with_doc=True,
-                      embed_code=True)
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "concept_drift_deployer"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("function.yaml")
-
-
-
-
-
> 2020-12-23 12:39:26,663 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb816dc7450>
-
-
-
-
-
-
-
fn.apply(mount_v3io())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f40cd475160>
-
-
-
-
-
-
-

Stream testing

-
-
-
fn.deploy()
-
-
-
-
-
-
-
fn.run(task)
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.1/static/function.html b/functions/development/concept_drift/0.0.1/static/function.html deleted file mode 100644 index e1edcfc1..00000000 --- a/functions/development/concept_drift/0.0.1/static/function.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 9b6fd888a47f7ce60c429a0b3fa11902d213edb0
-  project: default
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f4521c584e0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBvdXRwdXRfc3RyZWFtOiBzdHIsCiAgICBvdXRwdXRfdHNkYjogc3RyLAogICAgdHNkYl9iYXRjaF9zaXplOiBpbnQsCiAgICBjYWxsYmFja3M6IGxpc3QsCiAgICBtb2RlbHM6IGxpc3QgPSBbImRkbSIsICJlZGRtIiwgInBhZ2VoaW5rbGV5Il0sCiAgICBtb2RlbHNfZGVzdD0ibW9kZWxzIiwKICAgIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogZmxvYXQgPSAxMCwKICAgIGRkbV93YXJuaW5nX2xldmVsOiBmbG9hdCA9IDIsCiAgICBkZG1fb3V0X2NvbnRyb2xfbGV2ZWw6IGZsb2F0ID0gMywKICAgIGxhYmVsX2NvbD0ibGFiZWwiLAogICAgcHJlZGljdGlvbl9jb2w9InByZWRpY3Rpb24iLAogICAgaHViX3VybDogc3RyID0gbWxjb25mLmh1Yl91cmwsCiAgICBmbl90YWc6IHN0ciA9ICJtYXN0ZXIiLAopOgogICAgIiIiRGVwbG95IGEgc3RyZWFtaW5nIENvbmNlcHQgRHJpZnQgZGV0ZWN0b3Igb24gYSBsYWJlbGVkIHN0cmVhbQogICAgICAgVGhpcyBmdW5jdGlvbiBpcyB0aGUgRGVwbG95bWVudCBzdGVwIGZvciB0aGUgU3RyZWFtaW5nIENvbmNlcHQgRHJpZnQgRGV0ZWN0b3IuCiAgICAgICBJdCB3aWxsIGxvYWQgdGhlIHNlbGVjdGVkIGRyaWZ0IGRldGVjdG9ycyBhbmQgaW5pdGlhbGl6ZSB0aGVtIHdpdGggdGhlCiAgICAgICBiYXNlX2RhdGFzZXQncyBzdGF0aXN0aWNzLiAgVGhlbiBpdCB3aWxsIGRlcGxveSB0aGUgY29uY2VwdF9kcmlmdF9zdHJlYW1pbmcKICAgICAgIGZ1bmN0aW9uIGFuZCBwYXNzIHRoZSBtb2RlbHMgdG8gaXQgZm9yIHN0cmVhbWluZyBjb25jZXB0LWRyaWZ0IGRldGVjdGlvbiBvbiB0b3AKICAgICAgIG9mIGEgbGFiZWxlZCBzdHJlYW0uCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgTUxSdW4gY29udGV4dAogICAgOnBhcmFtIGJhc2VfZGF0YXNldDogICAgRGF0YXNldCBjb250YWluaW5nIGxhYmVsX2NvbCBhbmQgcHJlZGljdGlvbl9jb2wgdG8gaW5pdGlhbGl6ZSB0aGUgZGV0ZWN0b3JzCiAgICA6cGFyYW0gaW5wdXRfc3RyZWFtOiAgICB
sYWJlbGVkIHN0cmVhbSB0byB0cmFjay4KICAgICAgICAgICAgICAgICAgICAgICAgICAgIFNob3VsZCBjb250YWluIGxhYmVsX2NvbCBhbmQgcHJlZGljdGlvbl9jb2wKICAgIDpwYXJhbSBvdXRwdXRfc3RyZWFtOiAgIE91dHB1dCBzdHJlYW0gdG8gcHVzaCB0aGUgZGV0ZWN0b3IncyBhbGVydHMKICAgIDpwYXJhbSBvdXRwdXRfdHNkYjogICAgIE91dHB1dCBUU0RCIHRhYmxlIHRvIGFsbG93IGFuYWx5c2lzIGFuZCBkaXNwbGF5CiAgICA6cGFyYW0gdHNkYl9iYXRjaF9zaXplOiBCYXRjaCBzaXplIG9mIGFsZXJ0cyB0byBidWZmZXIgYmVmb3JlIHB1c2hpbmcgdG8gdGhlIFRTREIKICAgIDpwYXJhbSBjYWxsYmFja3M6ICAgICAgIEFkZGl0aW9uYWwgcmVzdCBlbmRwb2ludHMgdG8gc2VuZCB0aGUgYWxlcnQgZGF0YSB0bwogICAgOnBhcmFtIG1vZGVsczogICAgICAgICAgTGlzdCBvZiB0aGUgZGV0ZWN0b3JzIHRvIGRlcGxveQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gWydkZG0nLCAnZWRkbScsICdwYWdlaGlua2xleSddLgogICAgOnBhcmFtIG1vZGVsc19kZXN0OiAgICAgTG9jYXRpb24gZm9yIHNhdmluZyB0aGUgZGV0ZWN0b3JzCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAnbW9kZWxzJyAoaW4gcmVsYXRpb24gdG8gYXJ0aWZhY3RfcGF0aCkuCiAgICA6cGFyYW0gcGFnZWhpbmtsZXlfdGhyZXNob2xkOiAgRHJpZnQgbGV2ZWwgdGhyZXNob2xkIGZvciBQSCBkZXRlY3RvciBEZWZhdWx0cyB0byAxMC4KICAgIDpwYXJhbSBkZG1fd2FybmluZ19sZXZlbDogICAgICBXYXJuaW5nIGxldmVsIGFsZXJ0IGZvciBERE0gZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMi4KICAgIDpwYXJhbSBkZG1fb3V0X2NvbnRyb2xfbGV2ZWw6ICBEcmlmdCBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDMuCiAgICA6cGFyYW0gbGFiZWxfY29sOiAgICAgICBMYWJlbCBjb2x1bW4gdG8gYmUgdXNlZCBvbiBiYXNlX2RhdGFzZXQgYW5kIGlucHV0X3N0cmVhbQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ2xhYmVsJy4KICAgIDpwYXJhbSBwcmVkaWN0aW9uX2NvbDogIFByZWRpY3Rpb24gY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdwcmVkaWN0aW9uJy4KICAgIDpwYXJhbSBodWJfdXJsOiAgICAgICAgIGh1Yl91cmwgaW4gY2FzZSB0aGUgZGVmYXVsdCBpcyBub3QgdXNlZCwgY29uY2VwdF9kcmlmdF9zdHJlYW1pbmcgd2lsbCBiZSBsb2FkZWQKICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJ5IHRoaXMgdXJsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byBtbGNvbmYuaHViX3VybC4KICAgIDpwYXJhbSBmbl90YWc6ICAgICAgICAgIGh1YiB0YWcgdG8gdXNlCiAgICAgICAgICAgICAgICAgICAgICAgICAgICB
EZWZhdWx0cyB0byAnbWFzdGVyJwogICAgIiIiCgogICAgbWxjb25mLmRicGF0aCA9IG1sY29uZi5kYnBhdGggb3IgImh0dHA6Ly9tbHJ1bi1hcGk6ODA4MCIKICAgIG1sY29uZi5odWJfdXJsID0gaHViX3VybAogICAgZm4gPSBpbXBvcnRfZnVuY3Rpb24odXJsPSJodWI6Ly9jb25jZXB0X2RyaWZ0X3N0cmVhbWluZyIpCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiTG9hZGluZyBiYXNlIGRhdGFzZXQiKQogICAgYmFzZV9kZiA9IGJhc2VfZGF0YXNldC5hc19kZigpCiAgICBlcnJvcl9zdHJlYW0gPSBucC53aGVyZSgKICAgICAgICBiYXNlX2RmW3ByZWRpY3Rpb25fY29sXS52YWx1ZXMgPT0gYmFzZV9kZltsYWJlbF9jb2xdLnZhbHVlcywgMCwgMQogICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNyZWF0aW5nIG1vZGVscyIpCiAgICBtb2RlbHMgPSBbCiAgICAgICAgbW9kZWwuc3RyaXAoKQogICAgICAgIGZvciBtb2RlbCBpbiBvcy5nZXRlbnYoIm1vZGVscyIsICJwYWdlaGlua2xleSwgZGRtLCBlZGRtIikuc3BsaXQoIiwiKQogICAgXQogICAgbW9kZWxzID0gewogICAgICAgICJlZGRtIjogc2ttdWx0aWZsb3cuZHJpZnRfZGV0ZWN0aW9uLkVERE0oKSwKICAgICAgICAicGFnZWhpbmtsZXkiOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uUGFnZUhpbmtsZXkoCiAgICAgICAgICAgIG1pbl9pbnN0YW5jZXM9bGVuKGVycm9yX3N0cmVhbSksIHRocmVzaG9sZD1wYWdlaGlua2xleV90aHJlc2hvbGQKICAgICAgICApLAogICAgICAgICJkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRERNKAogICAgICAgICAgICBtaW5fbnVtX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwKICAgICAgICAgICAgd2FybmluZ19sZXZlbD1kZG1fd2FybmluZ19sZXZlbCwKICAgICAgICAgICAgb3V0X2NvbnRyb2xfbGV2ZWw9ZGRtX291dF9jb250cm9sX2xldmVsLAogICAgICAgICksCiAgICB9CgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiU3RyZWFtaW5nIGRhdGEgdG8gbW9kZWxzIikKICAgIGZvciBpIGluIHJhbmdlKGxlbihlcnJvcl9zdHJlYW0pKToKICAgICAgICBmb3IgbW9kZWxfbmFtZSwgbW9kZWwgaW4gbW9kZWxzLml0ZW1zKCk6CiAgICAgICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGVycm9yX3N0cmVhbVtpXSkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2dnaW5nIHJlYWR5IG1vZGVscyIpCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gbW9kZWxzLml0ZW1zKCk6CiAgICAgICAgZGF0YSA9IGR1bXBzKG1vZGVsKQogICAgICAgIG1vZGVsX2ZpbGUgPSBmIntuYW1lfS5wa2wiCiAgICAgICAgY29udGV4dC5sb2dfbW9kZWwoCiAgICAgICAgICAgIGYie25hbWV9X2NvbmNlcHRfZHJpZnQiLAogICAgICAgICAgICBib2R5PWRhdGEsCiAgICAgICAgICAgIGxhYmVscz17ImZyYW1ld29yayI6ICJza211bHRpZmxvdyIsICJ3b3JrZmxvdyI6ICJjb25jZXB0LWRyaWZ0In0sCiAgICAgICAgICAgIG1vZGVsX2ZpbGU9bW9
kZWxfZmlsZSwKICAgICAgICAgICAgbW9kZWxfZGlyPW1vZGVsc19kZXN0LAogICAgICAgICAgICB0YWc9ImxhdGVzdCIsCiAgICAgICAgKQogICAgICAgIGZuLnNldF9lbnZzKAogICAgICAgICAgICB7CiAgICAgICAgICAgICAgICBmIntuYW1lfV9tb2RlbF9wYXRoIjogb3MucGF0aC5qb2luKAogICAgICAgICAgICAgICAgICAgIGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgbW9kZWxzX2Rlc3QsIG1vZGVsX2ZpbGUKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgfQogICAgICAgICkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJEZXBsb3lpbmcgQ29uY2VwdCBEcmlmdCBTdHJlYW1pbmcgZnVuY3Rpb24iKQogICAgZm4uc2V0X2VudnMoCiAgICAgICAgewogICAgICAgICAgICAibGFiZWxfY29sIjogbGFiZWxfY29sLAogICAgICAgICAgICAicHJlZGljdGlvbl9jb2wiOiBwcmVkaWN0aW9uX2NvbCwKICAgICAgICAgICAgImRyaWZ0X3N0cmVhbSI6IG91dHB1dF9zdHJlYW0sCiAgICAgICAgICAgICJ0c2RiX3RhYmxlIjogb3V0cHV0X3RzZGIsCiAgICAgICAgICAgICJwYWdlaGlua2xleV90aHJlc2hvbGQiOiBwYWdlaGlua2xleV90aHJlc2hvbGQsCiAgICAgICAgICAgICJkZG1fd2FybmluZ19sZXZlbCI6IGRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICAiZGRtX291dF9jb250cm9sIjogZGRtX291dF9jb250cm9sX2xldmVsLAogICAgICAgIH0KICAgICkKICAgIGZuLmFkZF90cmlnZ2VyKAogICAgICAgICJsYWJlbGVkX3N0cmVhbSIsIFYzSU9TdHJlYW1UcmlnZ2VyKHVybD1pbnB1dF9zdHJlYW0sIG5hbWU9ImxhYmVsZWRfc3RyZWFtIikKICAgICkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/concept_drift/concept_drift.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.1/static/item.html b/functions/development/concept_drift/0.0.1/static/item.html deleted file mode 100644 index e4382f66..00000000 --- a/functions/development/concept_drift/0.0.1/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2021-05-19:22-04
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: concept-drift
-platformVersion: ''
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.1/static/source.html b/functions/development/concept_drift/0.0.1/static/source.html deleted file mode 100644 index 281c65e8..00000000 --- a/functions/development/concept_drift/0.0.1/static/source.html +++ /dev/null @@ -1,156 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url="hub://concept_drift_streaming")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_trigger(
-        "labeled_stream", V3IOStreamTrigger(url=input_stream, name="labeled_stream")
-    )
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.2/src/README.md b/functions/development/concept_drift/0.0.2/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/0.0.2/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. 
- Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. -:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$mu=np_t$](https://latex.codecogs.com/svg.latex?mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$]() - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- works better for **gradual** drift changes. -- More sensitive then DDM for noise -- Requires Minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}) - -![$\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$]() - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebasti˜ao1,2 and Jo˜ao Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/0.0.2/src/concept_drift.ipynb b/functions/development/concept_drift/0.0.2/src/concept_drift.ipynb deleted file mode 100644 index e9c063b6..00000000 --- a/functions/development/concept_drift/0.0.2/src/concept_drift.ipynb +++ /dev/null @@ -1,793 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming](https://github.com/mlrun/functions/blob/master/concept_drift_streaming/concept_drift_streaming.ipynb) function from the hub.
\n", - "adding [V3IOStreamTrigger](https://nuclio.io/docs/latest/reference/triggers/v3iostream/) in order to listen to the input_stream." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the input stream](#Creating-the-input-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the function remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the input stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "output_stream = os.path.join(container,user,rel_path) + \"/output_stream\"\n", - "tsdb_path = os.path.join(container,user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 'cg45'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importing the function\n", - "import mlrun\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://concept_drift:development\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080\n", - "> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb\n", - "> 2021-10-25 10:27:11,199 [info] Loading base dataset\n", - "> 2021-10-25 10:27:13,227 [info] Creating models\n", - "> 2021-10-25 10:27:13,227 [info] Streaming data to models\n", - "> 2021-10-25 10:27:13,347 [info] Logging ready models\n", - "> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function\n", - "> 2021-10-25 10:27:13,490 [info] Starting remote function deploy\n", - "2021-10-25 10:27:13 (info) Deploying function\n", - "2021-10-25 10:27:13 (info) Building\n", - "2021-10-25 10:27:13 (info) Staging files and preparing base images\n", - "2021-10-25 10:27:13 (info) Building processor image\n", - "2021-10-25 10:27:15 (info) Build complete\n", - "2021-10-25 10:27:21 (info) Function deploy complete\n", - "> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}\n", - "> 
2021-10-25 10:27:21,868 [info] run executed, status=completed\n", - "final state: completed\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:23,031 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "drift_run = fn.run(name='concept_drift',\n", - " params={'input_stream' : input_stream,\n", - " 'consumer_group' : stream_consumer_group,\n", - " 'output_stream' : output_stream,\n", - " 'output_tsdb' : tsdb_path,\n", - " 'tsdb_batch_size' : 1,\n", - " 'models' : ['ddm', 'eddm', 'pagehinkley'], # defaults\n", - " 'label_col' : 'class',\n", - " 'prediction_col' : 'predicted_col',\n", - " 'fn_tag' : 'development'},\n", - " inputs={'base_dataset' : predicted_train_path},\n", - " artifact_path = os.path.join(os.getcwd(), 'artifacts'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**\n", - "> Mark that we are testing the deployed function - concept_drift_streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"class\": 1.0, \"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818}]}, \"resp\": [1], \"when\": \"2021-10-25 10:27:23.152584\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not 
in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating v3io client\n", - "v3io_client = v3io.dataplane.Client()\n", - "\n", - "# Pushing some undrifted data to the input stream\n", - "response = v3io_client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " records=records[4900:5100])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'SequenceNumber': 200,\n", - " 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',\n", - " 'ArrivalTimeSec': 1635157644,\n", - " 'ArrivalTimeNSec': 395309631}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from input stream\n", - "response = 
v3io_client.stream.get_records(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0, location=location)\n", - "# Showing the last sequence that is written to the input stream\n", - "json.loads(response.body)['Records'][-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Make sure some time has passed - the function needs to be triggered by the input stream, then it'll write to the output stream" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from output stream\n", - "response = v3io_client.stream.get_records(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0, location=location)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}\n", - "sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}\n" - ] - } - ], - "source": [ - "# Showing changed detected\n", - "import base64\n", - "for instance in json.loads(response.body)['Records']:\n", - " seq = instance[\"SequenceNumber\"]\n", - " data = 
json.loads(base64.b64decode(instance['Data']))\n", - " if(data['ddm_drift']==1 or data['eddm_drift']==1):\n", - " print(f'sequence number : {seq}, data : {data}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
\n", - "5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Concept-Drift---Deployer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/0.0.2/src/concept_drift.py b/functions/development/concept_drift/0.0.2/src/concept_drift.py deleted file mode 100644 index f6fd8dcc..00000000 --- a/functions/development/concept_drift/0.0.2/src/concept_drift.py +++ /dev/null @@ -1,133 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This 
function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. 
- :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - 
fn.apply(mount_v3io()) - fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/0.0.2/src/function.yaml b/functions/development/concept_drift/0.0.2/src/function.yaml deleted file mode 100644 index eea75339..00000000 --- a/functions/development/concept_drift/0.0.2/src/function.yaml +++ /dev/null @@ -1,112 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 935da41196802875e19948974f32b6f00c29feb2 - project: default - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. 
Should contain label_col and prediction_col - default: '' - - name: consumer_group - type: str - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f48eda946d0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdC
BkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICAgOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bH
QgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW
9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQtZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: - - python -m pip install scikit-multiflow - code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py - origin_filename: /User/test/functions/concept_drift/concept_drift.py - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift/0.0.2/src/item.yaml b/functions/development/concept_drift/0.0.2/src/item.yaml deleted file mode 100644 index 
685ffe5f..00000000 --- a/functions/development/concept_drift/0.0.2/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving -description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2021-05-19:22-04 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: concept-drift -platformVersion: '' -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: [scikit-multiflow] -url: '' -version: 0.0.2 diff --git a/functions/development/concept_drift/0.0.2/static/documentation.html b/functions/development/concept_drift/0.0.2/static/documentation.html deleted file mode 100644 index f8ce9bb9..00000000 --- a/functions/development/concept_drift/0.0.2/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift package

-
-

Submodules

-
-
-

concept_drift.concept_drift module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.2/static/example.html b/functions/development/concept_drift/0.0.2/static/example.html deleted file mode 100644 index 1dd83595..00000000 --- a/functions/development/concept_drift/0.0.2/static/example.html +++ /dev/null @@ -1,753 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Concept Drift - Deployer

-

Deploy a streaming Concept Drift detector on a labeled stream.
-It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming function from the hub.
-adding V3IOStreamTrigger in order to listen to the input_stream.

- -
-

Data exploration

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the input stream

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-output_stream = os.path.join(container,user,rel_path) + "/output_stream"
-tsdb_path = os.path.join(container,user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 'cg45'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function

-
-
-
# Importing the function
-import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://concept_drift:development")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f145dd80fd0>
-
-
-
-
-
-
-

Running the function remotely

-
-
-
drift_run = fn.run(name='concept_drift',
-                   params={'input_stream'    : input_stream,
-                           'consumer_group'  : stream_consumer_group,
-                           'output_stream'   : output_stream,
-                           'output_tsdb'     : tsdb_path,
-                           'tsdb_batch_size' : 1,
-                           'models'          : ['ddm', 'eddm', 'pagehinkley'], # defaults
-                           'label_col'       : 'class',
-                           'prediction_col'  : 'predicted_col',
-                           'fn_tag'          : 'development'},
-                   inputs={'base_dataset'    : predicted_train_path},
-                   artifact_path = os.path.join(os.getcwd(), 'artifacts'))
-
-
-
-
-
> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080
-> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb
-> 2021-10-25 10:27:11,199 [info] Loading base dataset
-> 2021-10-25 10:27:13,227 [info] Creating models
-> 2021-10-25 10:27:13,227 [info] Streaming data to models
-> 2021-10-25 10:27:13,347 [info] Logging ready models
-> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function
-> 2021-10-25 10:27:13,490 [info] Starting remote function deploy
-2021-10-25 10:27:13  (info) Deploying function
-2021-10-25 10:27:13  (info) Building
-2021-10-25 10:27:13  (info) Staging files and preparing base images
-2021-10-25 10:27:13  (info) Building processor image
-2021-10-25 10:27:15  (info) Build complete
-2021-10-25 10:27:21  (info) Function deploy complete
-> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}
-> 2021-10-25 10:27:21,868 [info] run executed, status=completed
-final state: completed
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-25 10:27:23,031 [info] run executed, status=completed
-
-
-
-
-
-
-

Testing the function

-
-

Mark that we are testing the deployed function - concept_drift_streaming

-
-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"class": 1.0, "request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818}]}, "resp": [1], "when": "2021-10-25 10:27:23.152584", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Creating v3io client
-v3io_client = v3io.dataplane.Client()
-
-# Pushing some undrifted data to the input stream
-response = v3io_client.stream.put_records(container=container,
-                                          stream_path=base_input_stream, 
-                                          records=records[4900:5100])
-
-
-
-
-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_input_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from input stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_input_stream,
-                                          shard_id=0, location=location)
-# Showing the last sequence that is written to the input stream
-json.loads(response.body)['Records'][-1]
-
-
-
-
-
{'SequenceNumber': 200,
- 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',
- 'ArrivalTimeSec': 1635157644,
- 'ArrivalTimeNSec': 395309631}
-
-
-
-
-
-

Make sure some time has passed - the function needs to be triggered by the input stream, then it’ll write to the output stream

-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_output_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from output stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_output_stream,
-                                          shard_id=0, location=location)
-
-
-
-
-
-
-
# Showing changed detected
-import base64
-for instance in json.loads(response.body)['Records']:
-    seq = instance["SequenceNumber"]
-    data = json.loads(base64.b64decode(instance['Data']))
-    if(data['ddm_drift']==1 or data['eddm_drift']==1):
-        print(f'sequence number : {seq}, data : {data}')
-
-
-
-
-
sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}
-sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}
-
-
-
-
-

We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
-5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.

-

Back to the top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.2/static/function.html b/functions/development/concept_drift/0.0.2/static/function.html deleted file mode 100644 index c12aad7c..00000000 --- a/functions/development/concept_drift/0.0.2/static/function.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 935da41196802875e19948974f32b6f00c29feb2
-  project: default
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: consumer_group
-        type: str
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f48eda946d0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdCBkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICA
gOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bHQgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICA
gICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQ
tZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands:
-    - python -m pip install scikit-multiflow
-    code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py
-    origin_filename: /User/test/functions/concept_drift/concept_drift.py
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.2/static/item.html b/functions/development/concept_drift/0.0.2/static/item.html deleted file mode 100644 index f852bf11..00000000 --- a/functions/development/concept_drift/0.0.2/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2021-05-19:22-04
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: concept-drift
-platformVersion: ''
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements: [scikit-multiflow]
-url: ''
-version: 0.0.2
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.0.2/static/source.html b/functions/development/concept_drift/0.0.2/static/source.html deleted file mode 100644 index e6ae663b..00000000 --- a/functions/development/concept_drift/0.0.2/static/source.html +++ /dev/null @@ -1,155 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    consumer_group: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group)
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.8.0/src/README.md b/functions/development/concept_drift/0.8.0/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/0.8.0/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. 
- Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. -:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$mu=np_t$](https://latex.codecogs.com/svg.latex?mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$]() - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- works better for **gradual** drift changes. -- More sensitive then DDM for noise -- Requires Minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}) - -![$\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$]() - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebasti˜ao1,2 and Jo˜ao Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/0.8.0/src/concept_drift.ipynb b/functions/development/concept_drift/0.8.0/src/concept_drift.ipynb deleted file mode 100644 index e9c063b6..00000000 --- a/functions/development/concept_drift/0.8.0/src/concept_drift.ipynb +++ /dev/null @@ -1,793 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming](https://github.com/mlrun/functions/blob/master/concept_drift_streaming/concept_drift_streaming.ipynb) function from the hub.
\n", - "adding [V3IOStreamTrigger](https://nuclio.io/docs/latest/reference/triggers/v3iostream/) in order to listen to the input_stream." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the input stream](#Creating-the-input-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the function remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the input stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "output_stream = os.path.join(container,user,rel_path) + \"/output_stream\"\n", - "tsdb_path = os.path.join(container,user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 'cg45'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importing the function\n", - "import mlrun\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://concept_drift:development\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080\n", - "> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb\n", - "> 2021-10-25 10:27:11,199 [info] Loading base dataset\n", - "> 2021-10-25 10:27:13,227 [info] Creating models\n", - "> 2021-10-25 10:27:13,227 [info] Streaming data to models\n", - "> 2021-10-25 10:27:13,347 [info] Logging ready models\n", - "> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function\n", - "> 2021-10-25 10:27:13,490 [info] Starting remote function deploy\n", - "2021-10-25 10:27:13 (info) Deploying function\n", - "2021-10-25 10:27:13 (info) Building\n", - "2021-10-25 10:27:13 (info) Staging files and preparing base images\n", - "2021-10-25 10:27:13 (info) Building processor image\n", - "2021-10-25 10:27:15 (info) Build complete\n", - "2021-10-25 10:27:21 (info) Function deploy complete\n", - "> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}\n", - "> 
2021-10-25 10:27:21,868 [info] run executed, status=completed\n", - "final state: completed\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:23,031 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "drift_run = fn.run(name='concept_drift',\n", - " params={'input_stream' : input_stream,\n", - " 'consumer_group' : stream_consumer_group,\n", - " 'output_stream' : output_stream,\n", - " 'output_tsdb' : tsdb_path,\n", - " 'tsdb_batch_size' : 1,\n", - " 'models' : ['ddm', 'eddm', 'pagehinkley'], # defaults\n", - " 'label_col' : 'class',\n", - " 'prediction_col' : 'predicted_col',\n", - " 'fn_tag' : 'development'},\n", - " inputs={'base_dataset' : predicted_train_path},\n", - " artifact_path = os.path.join(os.getcwd(), 'artifacts'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**\n", - "> Mark that we are testing the deployed function - concept_drift_streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"class\": 1.0, \"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818}]}, \"resp\": [1], \"when\": \"2021-10-25 10:27:23.152584\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not 
in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating v3io client\n", - "v3io_client = v3io.dataplane.Client()\n", - "\n", - "# Pushing some undrifted data to the input stream\n", - "response = v3io_client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " records=records[4900:5100])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'SequenceNumber': 200,\n", - " 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',\n", - " 'ArrivalTimeSec': 1635157644,\n", - " 'ArrivalTimeNSec': 395309631}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from input stream\n", - "response = 
v3io_client.stream.get_records(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0, location=location)\n", - "# Showing the last sequence that is written to the input stream\n", - "json.loads(response.body)['Records'][-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Make sure some time has passed - the function needs to be triggered by the input stream, then it'll write to the output stream" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from output stream\n", - "response = v3io_client.stream.get_records(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0, location=location)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}\n", - "sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}\n" - ] - } - ], - "source": [ - "# Showing changed detected\n", - "import base64\n", - "for instance in json.loads(response.body)['Records']:\n", - " seq = instance[\"SequenceNumber\"]\n", - " data = 
json.loads(base64.b64decode(instance['Data']))\n", - " if(data['ddm_drift']==1 or data['eddm_drift']==1):\n", - " print(f'sequence number : {seq}, data : {data}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
\n", - "5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Concept-Drift---Deployer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/0.8.0/src/concept_drift.py b/functions/development/concept_drift/0.8.0/src/concept_drift.py deleted file mode 100644 index f6fd8dcc..00000000 --- a/functions/development/concept_drift/0.8.0/src/concept_drift.py +++ /dev/null @@ -1,133 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This 
function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. 
- :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - 
fn.apply(mount_v3io()) - fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/0.8.0/src/function.yaml b/functions/development/concept_drift/0.8.0/src/function.yaml deleted file mode 100644 index eea75339..00000000 --- a/functions/development/concept_drift/0.8.0/src/function.yaml +++ /dev/null @@ -1,112 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 935da41196802875e19948974f32b6f00c29feb2 - project: default - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. 
Should contain label_col and prediction_col - default: '' - - name: consumer_group - type: str - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f48eda946d0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdC
BkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICAgOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bH
QgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW
9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQtZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: - - python -m pip install scikit-multiflow - code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py - origin_filename: /User/test/functions/concept_drift/concept_drift.py - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift/0.8.0/src/item.yaml b/functions/development/concept_drift/0.8.0/src/item.yaml deleted file mode 100644 index 
038c35d5..00000000 --- a/functions/development/concept_drift/0.8.0/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving -description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2021-05-19:22-04 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: concept-drift -platformVersion: 3.2.0 -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: - - scikit-multiflow -url: '' -version: 0.8.0 diff --git a/functions/development/concept_drift/0.8.0/static/documentation.html b/functions/development/concept_drift/0.8.0/static/documentation.html deleted file mode 100644 index 37daa510..00000000 --- a/functions/development/concept_drift/0.8.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift package

-
-

Submodules

-
-
-

concept_drift.concept_drift module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.8.0/static/example.html b/functions/development/concept_drift/0.8.0/static/example.html deleted file mode 100644 index 4cd7041c..00000000 --- a/functions/development/concept_drift/0.8.0/static/example.html +++ /dev/null @@ -1,753 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Concept Drift - Deployer

-

Deploy a streaming Concept Drift detector on a labeled stream.
-It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming function from the hub.
-adding V3IOStreamTrigger in order to listen to the input_stream.

- -
-

Data exploration

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the input stream

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-output_stream = os.path.join(container,user,rel_path) + "/output_stream"
-tsdb_path = os.path.join(container,user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 'cg45'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function

-
-
-
# Importing the function
-import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://concept_drift:development")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f145dd80fd0>
-
-
-
-
-
-
-

Running the function remotely

-
-
-
drift_run = fn.run(name='concept_drift',
-                   params={'input_stream'    : input_stream,
-                           'consumer_group'  : stream_consumer_group,
-                           'output_stream'   : output_stream,
-                           'output_tsdb'     : tsdb_path,
-                           'tsdb_batch_size' : 1,
-                           'models'          : ['ddm', 'eddm', 'pagehinkley'], # defaults
-                           'label_col'       : 'class',
-                           'prediction_col'  : 'predicted_col',
-                           'fn_tag'          : 'development'},
-                   inputs={'base_dataset'    : predicted_train_path},
-                   artifact_path = os.path.join(os.getcwd(), 'artifacts'))
-
-
-
-
-
> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080
-> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb
-> 2021-10-25 10:27:11,199 [info] Loading base dataset
-> 2021-10-25 10:27:13,227 [info] Creating models
-> 2021-10-25 10:27:13,227 [info] Streaming data to models
-> 2021-10-25 10:27:13,347 [info] Logging ready models
-> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function
-> 2021-10-25 10:27:13,490 [info] Starting remote function deploy
-2021-10-25 10:27:13  (info) Deploying function
-2021-10-25 10:27:13  (info) Building
-2021-10-25 10:27:13  (info) Staging files and preparing base images
-2021-10-25 10:27:13  (info) Building processor image
-2021-10-25 10:27:15  (info) Build complete
-2021-10-25 10:27:21  (info) Function deploy complete
-> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}
-> 2021-10-25 10:27:21,868 [info] run executed, status=completed
-final state: completed
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-25 10:27:23,031 [info] run executed, status=completed
-
-
-
-
-
-
-

Testing the function

-
-

Mark that we are testing the deployed function - concept_drift_streaming

-
-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"class": 1.0, "request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818}]}, "resp": [1], "when": "2021-10-25 10:27:23.152584", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Creating v3io client
-v3io_client = v3io.dataplane.Client()
-
-# Pushing some undrifted data to the input stream
-response = v3io_client.stream.put_records(container=container,
-                                          stream_path=base_input_stream, 
-                                          records=records[4900:5100])
-
-
-
-
-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_input_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from input stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_input_stream,
-                                          shard_id=0, location=location)
-# Showing the last sequence that is written to the input stream
-json.loads(response.body)['Records'][-1]
-
-
-
-
-
{'SequenceNumber': 200,
- 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',
- 'ArrivalTimeSec': 1635157644,
- 'ArrivalTimeNSec': 395309631}
-
-
-
-
-
-

Make sure some time has passed - the function needs to be triggered by the input stream, then it’ll write to the output stream

-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_output_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from output stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_output_stream,
-                                          shard_id=0, location=location)
-
-
-
-
-
-
-
# Showing changed detected
-import base64
-for instance in json.loads(response.body)['Records']:
-    seq = instance["SequenceNumber"]
-    data = json.loads(base64.b64decode(instance['Data']))
-    if(data['ddm_drift']==1 or data['eddm_drift']==1):
-        print(f'sequence number : {seq}, data : {data}')
-
-
-
-
-
sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}
-sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}
-
-
-
-
-

We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
-5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.

-

Back to the top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.8.0/static/function.html b/functions/development/concept_drift/0.8.0/static/function.html deleted file mode 100644 index c12aad7c..00000000 --- a/functions/development/concept_drift/0.8.0/static/function.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 935da41196802875e19948974f32b6f00c29feb2
-  project: default
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: consumer_group
-        type: str
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f48eda946d0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdCBkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICA
gOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bHQgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICA
gICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQ
tZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands:
-    - python -m pip install scikit-multiflow
-    code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py
-    origin_filename: /User/test/functions/concept_drift/concept_drift.py
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.8.0/static/item.html b/functions/development/concept_drift/0.8.0/static/item.html deleted file mode 100644 index d3957a78..00000000 --- a/functions/development/concept_drift/0.8.0/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2021-05-19:22-04
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: concept-drift
-platformVersion: 3.2.0
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-multiflow
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.8.0/static/source.html b/functions/development/concept_drift/0.8.0/static/source.html deleted file mode 100644 index e6ae663b..00000000 --- a/functions/development/concept_drift/0.8.0/static/source.html +++ /dev/null @@ -1,155 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    consumer_group: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group)
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.0/src/README.md b/functions/development/concept_drift/0.9.0/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/0.9.0/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. 
- Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. -:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$\mu=np_t$](https://latex.codecogs.com/svg.latex?\mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$](https://latex.codecogs.com/svg.latex?\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}) - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- Works better for **gradual** drift changes. -- More sensitive than DDM to noise. -- Requires a minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{T}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{T}{x_t}) - -![$U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$](https://latex.codecogs.com/svg.latex?m_T=min(U_t,t=1..T)) - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebastião1,2 and João Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/0.9.0/src/concept_drift.ipynb b/functions/development/concept_drift/0.9.0/src/concept_drift.ipynb deleted file mode 100644 index e9c063b6..00000000 --- a/functions/development/concept_drift/0.9.0/src/concept_drift.ipynb +++ /dev/null @@ -1,793 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming](https://github.com/mlrun/functions/blob/master/concept_drift_streaming/concept_drift_streaming.ipynb) function from the hub.
\n", - "adding [V3IOStreamTrigger](https://nuclio.io/docs/latest/reference/triggers/v3iostream/) in order to listen to the input_stream." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the input stream](#Creating-the-input-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the function remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides further explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be split into train and test parts: the train part (first 5000 examples) is used to train the model (which is easily generated using [sklearn_classifier](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the input stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "output_stream = os.path.join(container,user,rel_path) + \"/output_stream\"\n", - "tsdb_path = os.path.join(container,user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 'cg45'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importing the function\n", - "import mlrun\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://concept_drift:development\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080\n", - "> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb\n", - "> 2021-10-25 10:27:11,199 [info] Loading base dataset\n", - "> 2021-10-25 10:27:13,227 [info] Creating models\n", - "> 2021-10-25 10:27:13,227 [info] Streaming data to models\n", - "> 2021-10-25 10:27:13,347 [info] Logging ready models\n", - "> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function\n", - "> 2021-10-25 10:27:13,490 [info] Starting remote function deploy\n", - "2021-10-25 10:27:13 (info) Deploying function\n", - "2021-10-25 10:27:13 (info) Building\n", - "2021-10-25 10:27:13 (info) Staging files and preparing base images\n", - "2021-10-25 10:27:13 (info) Building processor image\n", - "2021-10-25 10:27:15 (info) Build complete\n", - "2021-10-25 10:27:21 (info) Function deploy complete\n", - "> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}\n", - "> 
2021-10-25 10:27:21,868 [info] run executed, status=completed\n", - "final state: completed\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:23,031 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "drift_run = fn.run(name='concept_drift',\n", - " params={'input_stream' : input_stream,\n", - " 'consumer_group' : stream_consumer_group,\n", - " 'output_stream' : output_stream,\n", - " 'output_tsdb' : tsdb_path,\n", - " 'tsdb_batch_size' : 1,\n", - " 'models' : ['ddm', 'eddm', 'pagehinkley'], # defaults\n", - " 'label_col' : 'class',\n", - " 'prediction_col' : 'predicted_col',\n", - " 'fn_tag' : 'development'},\n", - " inputs={'base_dataset' : predicted_train_path},\n", - " artifact_path = os.path.join(os.getcwd(), 'artifacts'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**\n", - "> Mark that we are testing the deployed function - concept_drift_streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"class\": 1.0, \"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818}]}, \"resp\": [1], \"when\": \"2021-10-25 10:27:23.152584\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not 
in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating v3io client\n", - "v3io_client = v3io.dataplane.Client()\n", - "\n", - "# Pushing some undrifted data to the input stream\n", - "response = v3io_client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " records=records[4900:5100])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'SequenceNumber': 200,\n", - " 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',\n", - " 'ArrivalTimeSec': 1635157644,\n", - " 'ArrivalTimeNSec': 395309631}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from input stream\n", - "response = 
v3io_client.stream.get_records(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0, location=location)\n", - "# Showing the last sequence that is written to the input stream\n", - "json.loads(response.body)['Records'][-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Make sure some time has passed - the function needs to be triggered by the input stream, then it'll write to the output stream" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from output stream\n", - "response = v3io_client.stream.get_records(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0, location=location)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}\n", - "sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}\n" - ] - } - ], - "source": [ - "# Showing changed detected\n", - "import base64\n", - "for instance in json.loads(response.body)['Records']:\n", - " seq = instance[\"SequenceNumber\"]\n", - " data = 
json.loads(base64.b64decode(instance['Data']))\n", - " if(data['ddm_drift']==1 or data['eddm_drift']==1):\n", - " print(f'sequence number : {seq}, data : {data}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
\n", - "5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Concept-Drift---Deployer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/0.9.0/src/concept_drift.py b/functions/development/concept_drift/0.9.0/src/concept_drift.py deleted file mode 100644 index f6fd8dcc..00000000 --- a/functions/development/concept_drift/0.9.0/src/concept_drift.py +++ /dev/null @@ -1,133 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This 
function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. 
- :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - 
fn.apply(mount_v3io()) - fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/0.9.0/src/function.yaml b/functions/development/concept_drift/0.9.0/src/function.yaml deleted file mode 100644 index 071111c7..00000000 --- a/functions/development/concept_drift/0.9.0/src/function.yaml +++ /dev/null @@ -1,112 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 935da41196802875e19948974f32b6f00c29feb2 - project: '' - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. 
Should contain label_col and prediction_col - default: '' - - name: consumer_group - type: str - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f48eda946d0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdC
BkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICAgOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bH
QgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW
9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQtZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: - - python -m pip install scikit-multiflow - code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py - origin_filename: /User/test/functions/concept_drift/concept_drift.py - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift/0.9.0/src/item.yaml b/functions/development/concept_drift/0.9.0/src/item.yaml deleted file mode 100644 index 
4b362e9a..00000000 --- a/functions/development/concept_drift/0.9.0/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving -description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: concept-drift -platformVersion: 3.2.0 -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: - - scikit-multiflow -url: '' -version: 0.9.0 diff --git a/functions/development/concept_drift/0.9.0/static/documentation.html b/functions/development/concept_drift/0.9.0/static/documentation.html deleted file mode 100644 index 37daa510..00000000 --- a/functions/development/concept_drift/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift package

-
-

Submodules

-
-
-

concept_drift.concept_drift module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.0/static/example.html b/functions/development/concept_drift/0.9.0/static/example.html deleted file mode 100644 index 4cd7041c..00000000 --- a/functions/development/concept_drift/0.9.0/static/example.html +++ /dev/null @@ -1,753 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Concept Drift - Deployer

-

Deploy a streaming Concept Drift detector on a labeled stream.
-It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming function from the hub.
-adding V3IOStreamTrigger in order to listen to the input_stream.

- -
-

Data exploration

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the input stream

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-output_stream = os.path.join(container,user,rel_path) + "/output_stream"
-tsdb_path = os.path.join(container,user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 'cg45'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function

-
-
-
# Importing the function
-import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://concept_drift:development")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f145dd80fd0>
-
-
-
-
-
-
-

Running the function remotely

-
-
-
drift_run = fn.run(name='concept_drift',
-                   params={'input_stream'    : input_stream,
-                           'consumer_group'  : stream_consumer_group,
-                           'output_stream'   : output_stream,
-                           'output_tsdb'     : tsdb_path,
-                           'tsdb_batch_size' : 1,
-                           'models'          : ['ddm', 'eddm', 'pagehinkley'], # defaults
-                           'label_col'       : 'class',
-                           'prediction_col'  : 'predicted_col',
-                           'fn_tag'          : 'development'},
-                   inputs={'base_dataset'    : predicted_train_path},
-                   artifact_path = os.path.join(os.getcwd(), 'artifacts'))
-
-
-
-
-
> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080
-> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb
-> 2021-10-25 10:27:11,199 [info] Loading base dataset
-> 2021-10-25 10:27:13,227 [info] Creating models
-> 2021-10-25 10:27:13,227 [info] Streaming data to models
-> 2021-10-25 10:27:13,347 [info] Logging ready models
-> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function
-> 2021-10-25 10:27:13,490 [info] Starting remote function deploy
-2021-10-25 10:27:13  (info) Deploying function
-2021-10-25 10:27:13  (info) Building
-2021-10-25 10:27:13  (info) Staging files and preparing base images
-2021-10-25 10:27:13  (info) Building processor image
-2021-10-25 10:27:15  (info) Build complete
-2021-10-25 10:27:21  (info) Function deploy complete
-> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}
-> 2021-10-25 10:27:21,868 [info] run executed, status=completed
-final state: completed
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-25 10:27:23,031 [info] run executed, status=completed
-
-
-
-
-
-
-

Testing the function

-
-

Mark that we are testing the deployed function - concept_drift_streaming

-
-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"class": 1.0, "request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818}]}, "resp": [1], "when": "2021-10-25 10:27:23.152584", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Creating v3io client
-v3io_client = v3io.dataplane.Client()
-
-# Pushing some undrifted data to the input stream
-response = v3io_client.stream.put_records(container=container,
-                                          stream_path=base_input_stream, 
-                                          records=records[4900:5100])
-
-
-
-
-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_input_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from input stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_input_stream,
-                                          shard_id=0, location=location)
-# Showing the last sequence that is written to the input stream
-json.loads(response.body)['Records'][-1]
-
-
-
-
-
{'SequenceNumber': 200,
- 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',
- 'ArrivalTimeSec': 1635157644,
- 'ArrivalTimeNSec': 395309631}
-
-
-
-
-
-

Make sure some time has passed - the function needs to be triggered by the input stream, then it’ll write to the output stream

-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_output_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from output stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_output_stream,
-                                          shard_id=0, location=location)
-
-
-
-
-
-
-
# Showing changed detected
-import base64
-for instance in json.loads(response.body)['Records']:
-    seq = instance["SequenceNumber"]
-    data = json.loads(base64.b64decode(instance['Data']))
-    if(data['ddm_drift']==1 or data['eddm_drift']==1):
-        print(f'sequence number : {seq}, data : {data}')
-
-
-
-
-
sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}
-sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}
-
-
-
-
-

We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
-5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.

-

Back to the top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.0/static/function.html b/functions/development/concept_drift/0.9.0/static/function.html deleted file mode 100644 index f9afe688..00000000 --- a/functions/development/concept_drift/0.9.0/static/function.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 935da41196802875e19948974f32b6f00c29feb2
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: consumer_group
-        type: str
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f48eda946d0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdCBkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICA
gOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bHQgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICA
gICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQ
tZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands:
-    - python -m pip install scikit-multiflow
-    code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py
-    origin_filename: /User/test/functions/concept_drift/concept_drift.py
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.0/static/item.html b/functions/development/concept_drift/0.9.0/static/item.html deleted file mode 100644 index 8c8c2b65..00000000 --- a/functions/development/concept_drift/0.9.0/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: concept-drift
-platformVersion: 3.2.0
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-multiflow
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.0/static/source.html b/functions/development/concept_drift/0.9.0/static/source.html deleted file mode 100644 index e6ae663b..00000000 --- a/functions/development/concept_drift/0.9.0/static/source.html +++ /dev/null @@ -1,155 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    consumer_group: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group)
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.1/src/README.md b/functions/development/concept_drift/0.9.1/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/0.9.1/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. 
- Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. -:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$mu=np_t$](https://latex.codecogs.com/svg.latex?mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$]() - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- works better for **gradual** drift changes. -- More sensitive then DDM for noise -- Requires Minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}) - -![$\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$]() - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebasti˜ao1,2 and Jo˜ao Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/0.9.1/src/concept_drift.ipynb b/functions/development/concept_drift/0.9.1/src/concept_drift.ipynb deleted file mode 100644 index e9c063b6..00000000 --- a/functions/development/concept_drift/0.9.1/src/concept_drift.ipynb +++ /dev/null @@ -1,793 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming](https://github.com/mlrun/functions/blob/master/concept_drift_streaming/concept_drift_streaming.ipynb) function from the hub.
\n", - "adding [V3IOStreamTrigger](https://nuclio.io/docs/latest/reference/triggers/v3iostream/) in order to listen to the input_stream." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the input stream](#Creating-the-input-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the function remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the input stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "output_stream = os.path.join(container,user,rel_path) + \"/output_stream\"\n", - "tsdb_path = os.path.join(container,user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 'cg45'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importing the function\n", - "import mlrun\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://concept_drift:development\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080\n", - "> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb\n", - "> 2021-10-25 10:27:11,199 [info] Loading base dataset\n", - "> 2021-10-25 10:27:13,227 [info] Creating models\n", - "> 2021-10-25 10:27:13,227 [info] Streaming data to models\n", - "> 2021-10-25 10:27:13,347 [info] Logging ready models\n", - "> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function\n", - "> 2021-10-25 10:27:13,490 [info] Starting remote function deploy\n", - "2021-10-25 10:27:13 (info) Deploying function\n", - "2021-10-25 10:27:13 (info) Building\n", - "2021-10-25 10:27:13 (info) Staging files and preparing base images\n", - "2021-10-25 10:27:13 (info) Building processor image\n", - "2021-10-25 10:27:15 (info) Build complete\n", - "2021-10-25 10:27:21 (info) Function deploy complete\n", - "> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}\n", - "> 
2021-10-25 10:27:21,868 [info] run executed, status=completed\n", - "final state: completed\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:23,031 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "drift_run = fn.run(name='concept_drift',\n", - " params={'input_stream' : input_stream,\n", - " 'consumer_group' : stream_consumer_group,\n", - " 'output_stream' : output_stream,\n", - " 'output_tsdb' : tsdb_path,\n", - " 'tsdb_batch_size' : 1,\n", - " 'models' : ['ddm', 'eddm', 'pagehinkley'], # defaults\n", - " 'label_col' : 'class',\n", - " 'prediction_col' : 'predicted_col',\n", - " 'fn_tag' : 'development'},\n", - " inputs={'base_dataset' : predicted_train_path},\n", - " artifact_path = os.path.join(os.getcwd(), 'artifacts'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**\n", - "> Mark that we are testing the deployed function - concept_drift_streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"class\": 1.0, \"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818}]}, \"resp\": [1], \"when\": \"2021-10-25 10:27:23.152584\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not 
in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating v3io client\n", - "v3io_client = v3io.dataplane.Client()\n", - "\n", - "# Pushing some undrifted data to the input stream\n", - "response = v3io_client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " records=records[4900:5100])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'SequenceNumber': 200,\n", - " 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',\n", - " 'ArrivalTimeSec': 1635157644,\n", - " 'ArrivalTimeNSec': 395309631}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from input stream\n", - "response = 
v3io_client.stream.get_records(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0, location=location)\n", - "# Showing the last sequence that is written to the input stream\n", - "json.loads(response.body)['Records'][-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Make sure some time has passed - the function needs to be triggered by the input stream, then it'll write to the output stream" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from output stream\n", - "response = v3io_client.stream.get_records(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0, location=location)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}\n", - "sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}\n" - ] - } - ], - "source": [ - "# Showing changed detected\n", - "import base64\n", - "for instance in json.loads(response.body)['Records']:\n", - " seq = instance[\"SequenceNumber\"]\n", - " data = 
json.loads(base64.b64decode(instance['Data']))\n", - " if(data['ddm_drift']==1 or data['eddm_drift']==1):\n", - " print(f'sequence number : {seq}, data : {data}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
\n", - "5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Concept-Drift---Deployer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/0.9.1/src/concept_drift.py b/functions/development/concept_drift/0.9.1/src/concept_drift.py deleted file mode 100644 index f6fd8dcc..00000000 --- a/functions/development/concept_drift/0.9.1/src/concept_drift.py +++ /dev/null @@ -1,133 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This 
function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. 
- :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - 
fn.apply(mount_v3io()) - fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/0.9.1/src/function.yaml b/functions/development/concept_drift/0.9.1/src/function.yaml deleted file mode 100644 index 071111c7..00000000 --- a/functions/development/concept_drift/0.9.1/src/function.yaml +++ /dev/null @@ -1,112 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 935da41196802875e19948974f32b6f00c29feb2 - project: '' - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. 
Should contain label_col and prediction_col - default: '' - - name: consumer_group - type: str - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f48eda946d0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdC
BkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICAgOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bH
QgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW
9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQtZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: - - python -m pip install scikit-multiflow - code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py - origin_filename: /User/test/functions/concept_drift/concept_drift.py - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift/0.9.1/src/item.yaml b/functions/development/concept_drift/0.9.1/src/item.yaml deleted file mode 100644 index 
70ac4373..00000000 --- a/functions/development/concept_drift/0.9.1/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving -description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: concept-drift -platformVersion: 3.2.0 -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: - - scikit-multiflow -url: '' -version: 0.9.1 diff --git a/functions/development/concept_drift/0.9.1/src/requirements.txt b/functions/development/concept_drift/0.9.1/src/requirements.txt deleted file mode 100644 index fa0fddd8..00000000 --- a/functions/development/concept_drift/0.9.1/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -skmultiflow \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.1/static/documentation.html b/functions/development/concept_drift/0.9.1/static/documentation.html deleted file mode 100644 index 1ff6f693..00000000 --- a/functions/development/concept_drift/0.9.1/static/documentation.html +++ /dev/null @@ -1,170 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift package

-
-

Submodules

-
-
-

concept_drift.concept_drift module

-
-
-concept_drift.concept_drift.concept_drift_deployer(context: mlrun.execution.MLClientCtx, base_dataset: mlrun.datastore.base.DataItem, input_stream: str, consumer_group: str, output_stream: str, output_tsdb: str, tsdb_batch_size: int, callbacks: list, models: list = ['ddm', 'eddm', 'pagehinkley'], models_dest='models', pagehinkley_threshold: float = 10, ddm_warning_level: float = 2, ddm_out_control_level: float = 3, label_col='label', prediction_col='prediction', hub_url: str = 'https://raw.githubusercontent.com/mlrun/functions/{tag}/{name}/function.yaml', fn_tag: str = 'master')[source]
-
-
Deploy a streaming Concept Drift detector on a labeled stream

This function is the Deployment step for the Streaming Concept Drift Detector. -It will load the selected drift detectors and initialize them with the -base_dataset’s statistics. Then it will deploy the concept_drift_streaming -function and pass the models to it for streaming concept-drift detection on top -of a labeled stream.

-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • base_dataset – Dataset containing label_col and prediction_col to initialize the detectors

  • -
  • input_stream – labeled stream to track. -Should contain label_col and prediction_col

  • -
  • output_stream – Output stream to push the detector’s alerts

  • -
  • output_tsdb – Output TSDB table to allow analysis and display

  • -
  • tsdb_batch_size – Batch size of alerts to buffer before pushing to the TSDB

  • -
  • callbacks – Additional rest endpoints to send the alert data to

  • -
  • models – List of the detectors to deploy -Defaults to [‘ddm’, ‘eddm’, ‘pagehinkley’].

  • -
  • models_dest – Location for saving the detectors -Defaults to ‘models’ (in relation to artifact_path).

  • -
  • pagehinkley_threshold – Drift level threshold for PH detector Defaults to 10.

  • -
  • ddm_warning_level – Warning level alert for DDM detector Defaults to 2.

  • -
  • ddm_out_control_level – Drift level alert for DDM detector Defaults to 3.

  • -
  • label_col – Label column to be used on base_dataset and input_stream -Defaults to ‘label’.

  • -
  • prediction_col – Prediction column to be used on base_dataset and input_stream -Defaults to ‘prediction’.

  • -
  • hub_url – hub_url in case the default is not used, concept_drift_streaming will be loaded -by this url -Defaults to mlconf.hub_url.

  • -
  • fn_tag – hub tag to use -Defaults to ‘master’

  • -
-
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.1/static/example.html b/functions/development/concept_drift/0.9.1/static/example.html deleted file mode 100644 index e4eef5e6..00000000 --- a/functions/development/concept_drift/0.9.1/static/example.html +++ /dev/null @@ -1,756 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Concept Drift - Deployer

-

Deploy a streaming Concept Drift detector on a labeled stream.
-It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming function from the hub.
-adding V3IOStreamTrigger in order to listen to the input_stream.

- -
-

Data exploration

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the input stream

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-output_stream = os.path.join(container,user,rel_path) + "/output_stream"
-tsdb_path = os.path.join(container,user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 'cg45'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function

-
-
-
# Importing the function
-import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://concept_drift:development")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f145dd80fd0>
-
-
-
-
-
-
-

Running the function remotely

-
-
-
drift_run = fn.run(name='concept_drift',
-                   params={'input_stream'    : input_stream,
-                           'consumer_group'  : stream_consumer_group,
-                           'output_stream'   : output_stream,
-                           'output_tsdb'     : tsdb_path,
-                           'tsdb_batch_size' : 1,
-                           'models'          : ['ddm', 'eddm', 'pagehinkley'], # defaults
-                           'label_col'       : 'class',
-                           'prediction_col'  : 'predicted_col',
-                           'fn_tag'          : 'development'},
-                   inputs={'base_dataset'    : predicted_train_path},
-                   artifact_path = os.path.join(os.getcwd(), 'artifacts'))
-
-
-
-
-
> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080
-> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb
-> 2021-10-25 10:27:11,199 [info] Loading base dataset
-> 2021-10-25 10:27:13,227 [info] Creating models
-> 2021-10-25 10:27:13,227 [info] Streaming data to models
-> 2021-10-25 10:27:13,347 [info] Logging ready models
-> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function
-> 2021-10-25 10:27:13,490 [info] Starting remote function deploy
-2021-10-25 10:27:13  (info) Deploying function
-2021-10-25 10:27:13  (info) Building
-2021-10-25 10:27:13  (info) Staging files and preparing base images
-2021-10-25 10:27:13  (info) Building processor image
-2021-10-25 10:27:15  (info) Build complete
-2021-10-25 10:27:21  (info) Function deploy complete
-> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}
-> 2021-10-25 10:27:21,868 [info] run executed, status=completed
-final state: completed
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-25 10:27:23,031 [info] run executed, status=completed
-
-
-
-
-
-
-

Testing the function

-
-

Mark that we are testing the deployed function - concept_drift_streaming

-
-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"class": 1.0, "request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818}]}, "resp": [1], "when": "2021-10-25 10:27:23.152584", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Creating v3io client
-v3io_client = v3io.dataplane.Client()
-
-# Pushing some undrifted data to the input stream
-response = v3io_client.stream.put_records(container=container,
-                                          stream_path=base_input_stream, 
-                                          records=records[4900:5100])
-
-
-
-
-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_input_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from input stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_input_stream,
-                                          shard_id=0, location=location)
-# Showing the last sequence that is written to the input stream
-json.loads(response.body)['Records'][-1]
-
-
-
-
-
{'SequenceNumber': 200,
- 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',
- 'ArrivalTimeSec': 1635157644,
- 'ArrivalTimeNSec': 395309631}
-
-
-
-
-
-

Make sure some time has passed - the function needs to be triggered by the input stream, then it’ll write to the output stream

-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_output_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from output stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_output_stream,
-                                          shard_id=0, location=location)
-
-
-
-
-
-
-
# Showing changed detected
-import base64
-for instance in json.loads(response.body)['Records']:
-    seq = instance["SequenceNumber"]
-    data = json.loads(base64.b64decode(instance['Data']))
-    if(data['ddm_drift']==1 or data['eddm_drift']==1):
-        print(f'sequence number : {seq}, data : {data}')
-
-
-
-
-
sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}
-sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}
-
-
-
-
-

We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
-5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.

-

Back to the top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.1/static/function.html b/functions/development/concept_drift/0.9.1/static/function.html deleted file mode 100644 index f9afe688..00000000 --- a/functions/development/concept_drift/0.9.1/static/function.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 935da41196802875e19948974f32b6f00c29feb2
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: consumer_group
-        type: str
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f48eda946d0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdCBkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICA
gOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bHQgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICA
gICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQ
tZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands:
-    - python -m pip install scikit-multiflow
-    code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py
-    origin_filename: /User/test/functions/concept_drift/concept_drift.py
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.1/static/item.html b/functions/development/concept_drift/0.9.1/static/item.html deleted file mode 100644 index 07fe3786..00000000 --- a/functions/development/concept_drift/0.9.1/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: concept-drift
-platformVersion: 3.2.0
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-multiflow
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/0.9.1/static/source.html b/functions/development/concept_drift/0.9.1/static/source.html deleted file mode 100644 index e6ae663b..00000000 --- a/functions/development/concept_drift/0.9.1/static/source.html +++ /dev/null @@ -1,155 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    consumer_group: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group)
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/src/README.md b/functions/development/concept_drift/1.1.0/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/1.1.0/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. 
- Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. -:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$mu=np_t$](https://latex.codecogs.com/svg.latex?mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$]() - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- works better for **gradual** drift changes. -- More sensitive then DDM for noise -- Requires Minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}) - -![$\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$]() - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebasti˜ao1,2 and Jo˜ao Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/1.1.0/src/concept_drift.ipynb b/functions/development/concept_drift/1.1.0/src/concept_drift.ipynb deleted file mode 100644 index e9c063b6..00000000 --- a/functions/development/concept_drift/1.1.0/src/concept_drift.ipynb +++ /dev/null @@ -1,793 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming](https://github.com/mlrun/functions/blob/master/concept_drift_streaming/concept_drift_streaming.ipynb) function from the hub.
\n", - "adding [V3IOStreamTrigger](https://nuclio.io/docs/latest/reference/triggers/v3iostream/) in order to listen to the input_stream." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the input stream](#Creating-the-input-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the function remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the input stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "output_stream = os.path.join(container,user,rel_path) + \"/output_stream\"\n", - "tsdb_path = os.path.join(container,user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 'cg45'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importing the function\n", - "import mlrun\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://concept_drift:development\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080\n", - "> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb\n", - "> 2021-10-25 10:27:11,199 [info] Loading base dataset\n", - "> 2021-10-25 10:27:13,227 [info] Creating models\n", - "> 2021-10-25 10:27:13,227 [info] Streaming data to models\n", - "> 2021-10-25 10:27:13,347 [info] Logging ready models\n", - "> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function\n", - "> 2021-10-25 10:27:13,490 [info] Starting remote function deploy\n", - "2021-10-25 10:27:13 (info) Deploying function\n", - "2021-10-25 10:27:13 (info) Building\n", - "2021-10-25 10:27:13 (info) Staging files and preparing base images\n", - "2021-10-25 10:27:13 (info) Building processor image\n", - "2021-10-25 10:27:15 (info) Build complete\n", - "2021-10-25 10:27:21 (info) Function deploy complete\n", - "> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}\n", - "> 
2021-10-25 10:27:21,868 [info] run executed, status=completed\n", - "final state: completed\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:23,031 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "drift_run = fn.run(name='concept_drift',\n", - " params={'input_stream' : input_stream,\n", - " 'consumer_group' : stream_consumer_group,\n", - " 'output_stream' : output_stream,\n", - " 'output_tsdb' : tsdb_path,\n", - " 'tsdb_batch_size' : 1,\n", - " 'models' : ['ddm', 'eddm', 'pagehinkley'], # defaults\n", - " 'label_col' : 'class',\n", - " 'prediction_col' : 'predicted_col',\n", - " 'fn_tag' : 'development'},\n", - " inputs={'base_dataset' : predicted_train_path},\n", - " artifact_path = os.path.join(os.getcwd(), 'artifacts'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**\n", - "> Mark that we are testing the deployed function - concept_drift_streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"class\": 1.0, \"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818}]}, \"resp\": [1], \"when\": \"2021-10-25 10:27:23.152584\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not 
in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating v3io client\n", - "v3io_client = v3io.dataplane.Client()\n", - "\n", - "# Pushing some undrifted data to the input stream\n", - "response = v3io_client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " records=records[4900:5100])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'SequenceNumber': 200,\n", - " 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',\n", - " 'ArrivalTimeSec': 1635157644,\n", - " 'ArrivalTimeNSec': 395309631}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from input stream\n", - "response = 
v3io_client.stream.get_records(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0, location=location)\n", - "# Showing the last sequence that is written to the input stream\n", - "json.loads(response.body)['Records'][-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Make sure some time has passed - the function needs to be triggered by the input stream, then it'll write to the output stream" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from output stream\n", - "response = v3io_client.stream.get_records(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0, location=location)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}\n", - "sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}\n" - ] - } - ], - "source": [ - "# Showing changed detected\n", - "import base64\n", - "for instance in json.loads(response.body)['Records']:\n", - " seq = instance[\"SequenceNumber\"]\n", - " data = 
json.loads(base64.b64decode(instance['Data']))\n", - " if(data['ddm_drift']==1 or data['eddm_drift']==1):\n", - " print(f'sequence number : {seq}, data : {data}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
\n", - "5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Concept-Drift---Deployer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/1.1.0/src/concept_drift.py b/functions/development/concept_drift/1.1.0/src/concept_drift.py deleted file mode 100644 index 03355d3b..00000000 --- a/functions/development/concept_drift/1.1.0/src/concept_drift.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. 
- :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. - :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - 
labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - fn.apply(mount_v3io()) - fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/1.1.0/src/function.yaml b/functions/development/concept_drift/1.1.0/src/function.yaml deleted file mode 100644 index 071111c7..00000000 --- a/functions/development/concept_drift/1.1.0/src/function.yaml +++ /dev/null @@ -1,112 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 935da41196802875e19948974f32b6f00c29feb2 - project: '' - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." 
- parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. Should contain label_col and prediction_col - default: '' - - name: consumer_group - type: str - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f48eda946d0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdC
BkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICAgOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bH
QgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW
9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQtZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: - - python -m pip install scikit-multiflow - code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py - origin_filename: /User/test/functions/concept_drift/concept_drift.py - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift/1.1.0/src/item.yaml b/functions/development/concept_drift/1.1.0/src/item.yaml deleted file mode 100644 index 
2ee37e38..00000000 --- a/functions/development/concept_drift/1.1.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving -description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: concept-drift -platformVersion: 3.5.0 -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: - - scikit-multiflow -url: '' -version: 1.1.0 diff --git a/functions/development/concept_drift/1.1.0/src/requirements.txt b/functions/development/concept_drift/1.1.0/src/requirements.txt deleted file mode 100644 index fa0fddd8..00000000 --- a/functions/development/concept_drift/1.1.0/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -skmultiflow \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/static/concept_drift.html b/functions/development/concept_drift/1.1.0/static/concept_drift.html deleted file mode 100644 index 19468b25..00000000 --- a/functions/development/concept_drift/1.1.0/static/concept_drift.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - -concept_drift.concept_drift - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for concept_drift.concept_drift

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-
[docs]def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. 
- :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. - :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - 
"pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - fn.apply(mount_v3io()) - fn.deploy(project=context.project)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/static/documentation.html b/functions/development/concept_drift/1.1.0/static/documentation.html deleted file mode 100644 index 888532af..00000000 --- a/functions/development/concept_drift/1.1.0/static/documentation.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

concept_drift package

- -
- -
-
-
-
-
-

concept_drift package#

-
-

Submodules#

-
-
-

concept_drift.concept_drift module#

-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/static/example.html b/functions/development/concept_drift/1.1.0/static/example.html deleted file mode 100644 index 33a8e3db..00000000 --- a/functions/development/concept_drift/1.1.0/static/example.html +++ /dev/null @@ -1,894 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Concept Drift - Deployer#

-

Deploy a streaming Concept Drift detector on a labeled stream.
-It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming function from the hub.
-adding V3IOStreamTrigger in order to listen to the input_stream.

-
-

Steps#

-
    -
  1. Data exploration

  2. -
  3. Creating the input stream

  4. -
  5. Importing the function

  6. -
  7. Running the function remotely

  8. -
  9. Testing the function

  10. -
-
-
-

Data exploration#

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the input stream#

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-output_stream = os.path.join(container,user,rel_path) + "/output_stream"
-tsdb_path = os.path.join(container,user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 'cg45'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function#

-
-
-
# Importing the function
-import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://concept_drift:development")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f145dd80fd0>
-
-
-
-
-
-
-

Running the function remotely#

-
-
-
drift_run = fn.run(name='concept_drift',
-                   params={'input_stream'    : input_stream,
-                           'consumer_group'  : stream_consumer_group,
-                           'output_stream'   : output_stream,
-                           'output_tsdb'     : tsdb_path,
-                           'tsdb_batch_size' : 1,
-                           'models'          : ['ddm', 'eddm', 'pagehinkley'], # defaults
-                           'label_col'       : 'class',
-                           'prediction_col'  : 'predicted_col',
-                           'fn_tag'          : 'development'},
-                   inputs={'base_dataset'    : predicted_train_path},
-                   artifact_path = os.path.join(os.getcwd(), 'artifacts'))
-
-
-
-
-
> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080
-> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb
-> 2021-10-25 10:27:11,199 [info] Loading base dataset
-> 2021-10-25 10:27:13,227 [info] Creating models
-> 2021-10-25 10:27:13,227 [info] Streaming data to models
-> 2021-10-25 10:27:13,347 [info] Logging ready models
-> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function
-> 2021-10-25 10:27:13,490 [info] Starting remote function deploy
-2021-10-25 10:27:13  (info) Deploying function
-2021-10-25 10:27:13  (info) Building
-2021-10-25 10:27:13  (info) Staging files and preparing base images
-2021-10-25 10:27:13  (info) Building processor image
-2021-10-25 10:27:15  (info) Build complete
-2021-10-25 10:27:21  (info) Function deploy complete
-> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}
-> 2021-10-25 10:27:21,868 [info] run executed, status=completed
-final state: completed
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-25 10:27:23,031 [info] run executed, status=completed
-
-
-
-
-
-
-

Testing the function#

-
-

Mark that we are testing the deployed function - concept_drift_streaming

-
-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"class": 1.0, "request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818}]}, "resp": [1], "when": "2021-10-25 10:27:23.152584", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Creating v3io client
-v3io_client = v3io.dataplane.Client()
-
-# Pushing some undrifted data to the input stream
-response = v3io_client.stream.put_records(container=container,
-                                          stream_path=base_input_stream, 
-                                          records=records[4900:5100])
-
-
-
-
-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_input_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from input stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_input_stream,
-                                          shard_id=0, location=location)
-# Showing the last sequence that is written to the input stream
-json.loads(response.body)['Records'][-1]
-
-
-
-
-
{'SequenceNumber': 200,
- 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',
- 'ArrivalTimeSec': 1635157644,
- 'ArrivalTimeNSec': 395309631}
-
-
-
-
-
-

Make sure some time has passed - the function needs to be triggered by the input stream, then it’ll write to the output stream#

-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_output_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from output stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_output_stream,
-                                          shard_id=0, location=location)
-
-
-
-
-
-
-
# Showing changed detected
-import base64
-for instance in json.loads(response.body)['Records']:
-    seq = instance["SequenceNumber"]
-    data = json.loads(base64.b64decode(instance['Data']))
-    if(data['ddm_drift']==1 or data['eddm_drift']==1):
-        print(f'sequence number : {seq}, data : {data}')
-
-
-
-
-
sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}
-sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}
-
-
-
-
-

We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
-5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.

-

Back to the top

-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/static/function.html b/functions/development/concept_drift/1.1.0/static/function.html deleted file mode 100644 index f9afe688..00000000 --- a/functions/development/concept_drift/1.1.0/static/function.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 935da41196802875e19948974f32b6f00c29feb2
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: consumer_group
-        type: str
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f48eda946d0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdCBkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICA
gOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bHQgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICA
gICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQ
tZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands:
-    - python -m pip install scikit-multiflow
-    code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py
-    origin_filename: /User/test/functions/concept_drift/concept_drift.py
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/static/item.html b/functions/development/concept_drift/1.1.0/static/item.html deleted file mode 100644 index d61d5687..00000000 --- a/functions/development/concept_drift/1.1.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: concept-drift
-platformVersion: 3.5.0
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-multiflow
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/1.1.0/static/source.html b/functions/development/concept_drift/1.1.0/static/source.html deleted file mode 100644 index 6aeba7a7..00000000 --- a/functions/development/concept_drift/1.1.0/static/source.html +++ /dev/null @@ -1,169 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    consumer_group: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group)
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/latest/src/README.md b/functions/development/concept_drift/latest/src/README.md deleted file mode 100644 index 92e6d893..00000000 --- a/functions/development/concept_drift/latest/src/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Concept Drift - -**Concept drift** is a change in the statistical properties of the **target variable** over time. - -When deploying our models to production, we must ensure our models perform as we expect them to - reaching the same level of performence we have seen on our test sets or at least performing in the same quality as when they were deployed. - -However, often this is not the case. there are many factors that can affect our model's performance like seasonality or any unkown root causes that will change the laws underlying our data and invalidate some assumptions made by the model. - -We offer this function to help combat Concept Drift with implementation of streaming DDM, EDDM and PH concept drift detectors. - -## How to integrate - -This function is made of two parts: - -1. Kubernetes job to instantiate the selected models with a provided base dataset (the test dataset could be used) -2. [Nuclio serverless function](../concept_drift_streaming/concept_drift_streaming.ipynb) listed on a _labeled stream_, which will be deployed from this function after the models initialization and run the models per event and provide necessary alerts. - -There are two steps to integrate sucessfully with your workflow: - -1. Provide a stream where each event containes the joined **label** and **prediction** for that specific event. -2. Add this function to the workflow with the following params: - -```markdown -:param context: MLRun context -:param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors -:param input_stream: labeled stream to track. 
- Should contain label_col and prediction_col -:param output_stream: Output stream to push the detector's alerts -:param output_tsdb: Output TSDB table to allow analysis and display -:param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB -:param callbacks: Additional rest endpoints to send the alert data to -:param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. -:param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). -:param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. -:param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. -:param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. -:param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. -:param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. -:param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. -:param fn_tag: hub tag to use - Defaults to 'master' -``` - -## Algorithms - -We offer to deploy up to 3 concept drift streaming detectors - -### DDM - Drift Detection Method - -Models the **Number of errors** as a **binomial** variable. This enables us to confine the expected number of errors in a prediction stream window to within some standard deviation. - -- Good for **abrupt** drift changes - -
- -![$mu=np_t$](https://latex.codecogs.com/svg.latex?mu=np_t) - -![$\sigma=\sqrt{\frac{p_t(1-p_t)}{n}}$]() - -
- -**Alert** when: - -
- -![$p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}$](https://latex.codecogs.com/svg.latex?p_t+\sigma_t\ge{p_{min}+3\sigma_{min}}) - -
- -### EDDM - Early Drift Detection Method - -Uses the distance between two consecutive errors. - -- works better for **gradual** drift changes. -- More sensitive then DDM for noise -- Requires Minimal number of errors to initialize the statistics. - -**Warning**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.95) - -
- -**Alert**: - -
- -![$\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90$](https://latex.codecogs.com/svg.latex?\frac{p_t+2\sigma_t}{p_{max}+2\sigma_{max}}<0.90) - -
- -### PageHinkley Test: - -The PageHinkley test is a sequential analysis technique typically used for monitoring change detection. (The test was designed to detect change in avg. of a Gaussian signal). In this test we use: -x*1*, ..., x*n* - labeled dataset -δ - magnitude threshold -λ - detection threshold - -
- -![$\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}$](https://latex.codecogs.com/svg.latex?\hat{x_T}=\frac{1}{T}\sum_{t=1}^{t}{x_t}) - -![$\sum_{t=1}^T{x_t-\hat{x_T}-\delta}$](https://latex.codecogs.com/svg.latex?U_T=\sum_{t=1}^T{x_t-\hat{x_T}-\delta}) - -![$m_T=min(U_t,t=1..T)$]() - -
- -**Alert**: - -
- -![$U_T-m_T>\lambda$](https://latex.codecogs.com/svg.latex?U_T-m_T>\lambda) - -
- -## Additional resources -[A Study on Change Detection Methods](https://pdfs.semanticscholar.org/bb6e/8a44c0efcd725aae1c0b1817561f6e278c2c.pdf), Raquel Sebasti˜ao1,2 and Jo˜ao Gama1,3, 1 LIAAD-INESC Porto L.A., University of Porto -Rua de Ceuta, 118 - 6, 4050-190 Porto, Portugal -2 Faculty of Science, University of Porto -3 Faculty of Economics, University of Porto -{raquel,jgama}@liaad.up.pt - -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/concept_drift/latest/src/concept_drift.ipynb b/functions/development/concept_drift/latest/src/concept_drift.ipynb deleted file mode 100644 index e9c063b6..00000000 --- a/functions/development/concept_drift/latest/src/concept_drift.ipynb +++ /dev/null @@ -1,793 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift - Deployer\n", - "Deploy a streaming Concept Drift detector on a labeled stream. \n", - "It will initialize the selected drift detectors with the base_dataset's statistics and deploy the [concept_drift_streaming](https://github.com/mlrun/functions/blob/master/concept_drift_streaming/concept_drift_streaming.ipynb) function from the hub.
\n", - "adding [V3IOStreamTrigger](https://nuclio.io/docs/latest/reference/triggers/v3iostream/) in order to listen to the input_stream." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the input stream](#Creating-the-input-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the function remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the input stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "output_stream = os.path.join(container,user,rel_path) + \"/output_stream\"\n", - "tsdb_path = os.path.join(container,user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 'cg45'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Importing the function\n", - "import mlrun\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://concept_drift:development\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080\n", - "> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb\n", - "> 2021-10-25 10:27:11,199 [info] Loading base dataset\n", - "> 2021-10-25 10:27:13,227 [info] Creating models\n", - "> 2021-10-25 10:27:13,227 [info] Streaming data to models\n", - "> 2021-10-25 10:27:13,347 [info] Logging ready models\n", - "> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function\n", - "> 2021-10-25 10:27:13,490 [info] Starting remote function deploy\n", - "2021-10-25 10:27:13 (info) Deploying function\n", - "2021-10-25 10:27:13 (info) Building\n", - "2021-10-25 10:27:13 (info) Staging files and preparing base images\n", - "2021-10-25 10:27:13 (info) Building processor image\n", - "2021-10-25 10:27:15 (info) Build complete\n", - "2021-10-25 10:27:21 (info) Function deploy complete\n", - "> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}\n", - "> 
2021-10-25 10:27:21,868 [info] run executed, status=completed\n", - "final state: completed\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-25 10:27:23,031 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "drift_run = fn.run(name='concept_drift',\n", - " params={'input_stream' : input_stream,\n", - " 'consumer_group' : stream_consumer_group,\n", - " 'output_stream' : output_stream,\n", - " 'output_tsdb' : tsdb_path,\n", - " 'tsdb_batch_size' : 1,\n", - " 'models' : ['ddm', 'eddm', 'pagehinkley'], # defaults\n", - " 'label_col' : 'class',\n", - " 'prediction_col' : 'predicted_col',\n", - " 'fn_tag' : 'development'},\n", - " inputs={'base_dataset' : predicted_train_path},\n", - " artifact_path = os.path.join(os.getcwd(), 'artifacts'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**\n", - "> Mark that we are testing the deployed function - concept_drift_streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"class\": 1.0, \"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818}]}, \"resp\": [1], \"when\": \"2021-10-25 10:27:23.152584\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not 
in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating v3io client\n", - "v3io_client = v3io.dataplane.Client()\n", - "\n", - "# Pushing some undrifted data to the input stream\n", - "response = v3io_client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " records=records[4900:5100])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'SequenceNumber': 200,\n", - " 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',\n", - " 'ArrivalTimeSec': 1635157644,\n", - " 'ArrivalTimeNSec': 395309631}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from input stream\n", - "response = 
v3io_client.stream.get_records(container=container,\n", - " stream_path=base_input_stream,\n", - " shard_id=0, location=location)\n", - "# Showing the last sequence that is written to the input stream\n", - "json.loads(response.body)['Records'][-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Make sure some time has passed - the function needs to be triggered by the input stream, then it'll write to the output stream" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting earliest location in the shard\n", - "location = json.loads(v3io_client.stream.seek(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0,\n", - " seek_type='EARLIEST').body)['Location']\n", - "# Getting records from output stream\n", - "response = v3io_client.stream.get_records(container=container,\n", - " stream_path=base_output_stream,\n", - " shard_id=0, location=location)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}\n", - "sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}\n" - ] - } - ], - "source": [ - "# Showing changed detected\n", - "import base64\n", - "for instance in json.loads(response.body)['Records']:\n", - " seq = instance[\"SequenceNumber\"]\n", - " data = 
json.loads(base64.b64decode(instance['Data']))\n", - " if(data['ddm_drift']==1 or data['eddm_drift']==1):\n", - " print(f'sequence number : {seq}, data : {data}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
\n", - "5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Concept-Drift---Deployer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift/latest/src/concept_drift.py b/functions/development/concept_drift/latest/src/concept_drift.py deleted file mode 100644 index 03355d3b..00000000 --- a/functions/development/concept_drift/latest/src/concept_drift.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection # We will grab our PH, DDM, EDDM algorithms from here -import numpy as np -import pandas as pd -import os -from cloudpickle import dumps, load, dump - -from nuclio.triggers import V3IOStreamTrigger -from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io - -import random - - -def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. 
- :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. - :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. - :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - 
labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - "pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - fn.apply(mount_v3io()) - fn.deploy(project=context.project) diff --git a/functions/development/concept_drift/latest/src/function.yaml b/functions/development/concept_drift/latest/src/function.yaml deleted file mode 100644 index 071111c7..00000000 --- a/functions/development/concept_drift/latest/src/function.yaml +++ /dev/null @@ -1,112 +0,0 @@ -kind: job -metadata: - name: concept-drift - tag: '' - hash: 935da41196802875e19948974f32b6f00c29feb2 - project: '' - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - model-serving -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: concept_drift_deployer - entry_points: - concept_drift_deployer: - name: concept_drift_deployer - doc: "Deploy a streaming Concept Drift detector on a labeled stream\n This\ - \ function is the Deployment step for the Streaming Concept Drift Detector.\n\ - \ It will load the selected drift detectors and initialize them with the\n\ - \ base_dataset's statistics. Then it will deploy the concept_drift_streaming\n\ - \ function and pass the models to it for streaming concept-drift detection\ - \ on top\n of a labeled stream." 
- parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: base_dataset - type: DataItem - doc: Dataset containing label_col and prediction_col to initialize the detectors - default: '' - - name: input_stream - type: str - doc: labeled stream to track. Should contain label_col and prediction_col - default: '' - - name: consumer_group - type: str - default: '' - - name: output_stream - type: str - doc: Output stream to push the detector's alerts - default: '' - - name: output_tsdb - type: str - doc: Output TSDB table to allow analysis and display - default: '' - - name: tsdb_batch_size - type: int - doc: Batch size of alerts to buffer before pushing to the TSDB - default: '' - - name: callbacks - type: list - doc: Additional rest endpoints to send the alert data to - default: '' - - name: models - type: list - doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley']. - default: - - ddm - - eddm - - pagehinkley - - name: models_dest - doc: Location for saving the detectors Defaults to 'models' (in relation to - artifact_path). - default: models - - name: pagehinkley_threshold - type: float - doc: Drift level threshold for PH detector Defaults to 10. - default: 10 - - name: ddm_warning_level - type: float - doc: Warning level alert for DDM detector Defaults to 2. - default: 2 - - name: ddm_out_control_level - type: float - doc: Drift level alert for DDM detector Defaults to 3. - default: 3 - - name: label_col - doc: Label column to be used on base_dataset and input_stream Defaults to - 'label'. - default: label - - name: prediction_col - doc: Prediction column to be used on base_dataset and input_stream Defaults - to 'prediction'. - default: prediction - - name: hub_url - type: str - doc: hub_url in case the default is not used, concept_drift_streaming will - be loaded by this url Defaults to mlconf.hub_url. 
- default: <_ast.Name object at 0x7f48eda946d0> - - name: fn_tag - type: str - doc: hub tag to use Defaults to 'master' - default: master - outputs: - - default: '' - lineno: 15 - description: Deploy a streaming Concept Drift detector on a labeled stream - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdC
BkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICAgOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bH
QgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW
9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQtZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK - commands: - - python -m pip install scikit-multiflow - code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py - origin_filename: /User/test/functions/concept_drift/concept_drift.py - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift/latest/src/item.yaml b/functions/development/concept_drift/latest/src/item.yaml deleted file mode 100644 index 
2ee37e38..00000000 --- a/functions/development/concept_drift/latest/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-serving -description: Deploy a streaming Concept Drift detector on a labeled stream -doc: '' -example: concept_drift.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: concept-drift -platformVersion: 3.5.0 -spec: - filename: concept_drift.py - handler: concept_drift_deployer - image: mlrun/ml-models - kind: job - requirements: - - scikit-multiflow -url: '' -version: 1.1.0 diff --git a/functions/development/concept_drift/latest/src/requirements.txt b/functions/development/concept_drift/latest/src/requirements.txt deleted file mode 100644 index fa0fddd8..00000000 --- a/functions/development/concept_drift/latest/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -skmultiflow \ No newline at end of file diff --git a/functions/development/concept_drift/latest/static/concept_drift.html b/functions/development/concept_drift/latest/static/concept_drift.html deleted file mode 100644 index 19468b25..00000000 --- a/functions/development/concept_drift/latest/static/concept_drift.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - -concept_drift.concept_drift - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for concept_drift.concept_drift

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-
[docs]def concept_drift_deployer( - context: MLClientCtx, - base_dataset: DataItem, - input_stream: str, - consumer_group: str, - output_stream: str, - output_tsdb: str, - tsdb_batch_size: int, - callbacks: list, - models: list = ["ddm", "eddm", "pagehinkley"], - models_dest="models", - pagehinkley_threshold: float = 10, - ddm_warning_level: float = 2, - ddm_out_control_level: float = 3, - label_col="label", - prediction_col="prediction", - hub_url: str = mlconf.hub_url, - fn_tag: str = "master", -): - """Deploy a streaming Concept Drift detector on a labeled stream - This function is the Deployment step for the Streaming Concept Drift Detector. - It will load the selected drift detectors and initialize them with the - base_dataset's statistics. Then it will deploy the concept_drift_streaming - function and pass the models to it for streaming concept-drift detection on top - of a labeled stream. - - :param context: MLRun context - :param base_dataset: Dataset containing label_col and prediction_col to initialize the detectors - :param input_stream: labeled stream to track. - Should contain label_col and prediction_col - :param output_stream: Output stream to push the detector's alerts - :param output_tsdb: Output TSDB table to allow analysis and display - :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB - :param callbacks: Additional rest endpoints to send the alert data to - :param models: List of the detectors to deploy - Defaults to ['ddm', 'eddm', 'pagehinkley']. - :param models_dest: Location for saving the detectors - Defaults to 'models' (in relation to artifact_path). - :param pagehinkley_threshold: Drift level threshold for PH detector Defaults to 10. - :param ddm_warning_level: Warning level alert for DDM detector Defaults to 2. - :param ddm_out_control_level: Drift level alert for DDM detector Defaults to 3. - :param label_col: Label column to be used on base_dataset and input_stream - Defaults to 'label'. 
- :param prediction_col: Prediction column to be used on base_dataset and input_stream - Defaults to 'prediction'. - :param hub_url: hub_url in case the default is not used, concept_drift_streaming will be loaded - by this url - Defaults to mlconf.hub_url. - :param fn_tag: hub tag to use - Defaults to 'master' - """ - - mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080" - mlconf.hub_url = hub_url - fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}") - - context.logger.info("Loading base dataset") - base_df = base_dataset.as_df() - error_stream = np.where( - base_df[prediction_col].values == base_df[label_col].values, 0, 1 - ) - - context.logger.info("Creating models") - models = [ - model.strip() - for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",") - ] - models = { - "eddm": skmultiflow.drift_detection.EDDM(), - "pagehinkley": skmultiflow.drift_detection.PageHinkley( - min_instances=len(error_stream), threshold=pagehinkley_threshold - ), - "ddm": skmultiflow.drift_detection.DDM( - min_num_instances=len(error_stream), - warning_level=ddm_warning_level, - out_control_level=ddm_out_control_level, - ), - } - - context.logger.info("Streaming data to models") - for i in range(len(error_stream)): - for model_name, model in models.items(): - model.add_element(error_stream[i]) - - context.logger.info("Logging ready models") - for name, model in models.items(): - data = dumps(model) - model_file = f"{name}.pkl" - context.log_model( - f"{name}_concept_drift", - body=data, - labels={"framework": "skmultiflow", "workflow": "concept-drift"}, - model_file=model_file, - model_dir=models_dest, - tag="latest", - ) - fn.set_envs( - { - f"{name}_model_path": os.path.join( - context.artifact_path, models_dest, model_file - ) - } - ) - - context.logger.info("Deploying Concept Drift Streaming function") - fn.set_envs( - { - "label_col": label_col, - "prediction_col": prediction_col, - "drift_stream": output_stream, - "tsdb_table": output_tsdb, - 
"pagehinkley_threshold": pagehinkley_threshold, - "ddm_warning_level": ddm_warning_level, - "ddm_out_control": ddm_out_control_level, - } - ) - fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group) - fn.apply(mount_v3io()) - fn.deploy(project=context.project)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift/latest/static/documentation.html b/functions/development/concept_drift/latest/static/documentation.html deleted file mode 100644 index 888532af..00000000 --- a/functions/development/concept_drift/latest/static/documentation.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - -concept_drift package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

concept_drift package

- -
- -
-
-
-
-
-

concept_drift package#

-
-

Submodules#

-
-
-

concept_drift.concept_drift module#

-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift/latest/static/example.html b/functions/development/concept_drift/latest/static/example.html deleted file mode 100644 index 33a8e3db..00000000 --- a/functions/development/concept_drift/latest/static/example.html +++ /dev/null @@ -1,894 +0,0 @@ - - - - - - - -Concept Drift - Deployer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Concept Drift - Deployer#

-

Deploy a streaming Concept Drift detector on a labeled stream.
-It will initialize the selected drift detectors with the base_dataset’s statistics and deploy the concept_drift_streaming function from the hub.
-adding V3IOStreamTrigger in order to listen to the input_stream.

-
-

Steps#

-
    -
  1. Data exploration

  2. -
  3. Creating the input stream

  4. -
  5. Importing the function

  6. -
  7. Running the function remotely

  8. -
  9. Testing the function

  10. -
-
-
-

Data exploration#

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-predicted_train_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the input stream#

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-output_stream = os.path.join(container,user,rel_path) + "/output_stream"
-tsdb_path = os.path.join(container,user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 'cg45'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function#

-
-
-
# Importing the function
-import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://concept_drift:development")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-25 10:27:04,105 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f145dd80fd0>
-
-
-
-
-
-
-

Running the function remotely#

-
-
-
drift_run = fn.run(name='concept_drift',
-                   params={'input_stream'    : input_stream,
-                           'consumer_group'  : stream_consumer_group,
-                           'output_stream'   : output_stream,
-                           'output_tsdb'     : tsdb_path,
-                           'tsdb_batch_size' : 1,
-                           'models'          : ['ddm', 'eddm', 'pagehinkley'], # defaults
-                           'label_col'       : 'class',
-                           'prediction_col'  : 'predicted_col',
-                           'fn_tag'          : 'development'},
-                   inputs={'base_dataset'    : predicted_train_path},
-                   artifact_path = os.path.join(os.getcwd(), 'artifacts'))
-
-
-
-
-
> 2021-10-25 10:27:04,567 [info] starting run concept_drift uid=fa07c222e77d4eac86d2ce9317aaded1 DB=http://mlrun-api:8080
-> 2021-10-25 10:27:04,709 [info] Job is running in the background, pod: concept-drift-ggxgb
-> 2021-10-25 10:27:11,199 [info] Loading base dataset
-> 2021-10-25 10:27:13,227 [info] Creating models
-> 2021-10-25 10:27:13,227 [info] Streaming data to models
-> 2021-10-25 10:27:13,347 [info] Logging ready models
-> 2021-10-25 10:27:13,487 [info] Deploying Concept Drift Streaming function
-> 2021-10-25 10:27:13,490 [info] Starting remote function deploy
-2021-10-25 10:27:13  (info) Deploying function
-2021-10-25 10:27:13  (info) Building
-2021-10-25 10:27:13  (info) Staging files and preparing base images
-2021-10-25 10:27:13  (info) Building processor image
-2021-10-25 10:27:15  (info) Build complete
-2021-10-25 10:27:21  (info) Function deploy complete
-> 2021-10-25 10:27:21,797 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-concept-drift-streaming.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31143']}
-> 2021-10-25 10:27:21,868 [info] run executed, status=completed
-final state: completed
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 25 10:27:10completedconcept_drift
v3io_user=dani
kind=job
owner=dani
host=concept-drift-ggxgb
base_dataset
input_stream=/users/dani/test/functions/concept_drift/artifacts/inputs_stream
consumer_group=cg45
output_stream=/users/dani/test/functions/concept_drift/artifacts/output_stream
output_tsdb=/users/dani/test/functions/concept_drift/artifacts/output_tsdb
tsdb_batch_size=1
models=['ddm', 'eddm', 'pagehinkley']
label_col=class
prediction_col=predicted_col
fn_tag=development
eddm_concept_drift
pagehinkley_concept_drift
ddm_concept_drift
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-25 10:27:23,031 [info] run executed, status=completed
-
-
-
-
-
-
-

Testing the function#

-
-

Mark that we are testing the deployed function - concept_drift_streaming

-
-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'class', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"class": 1.0, "request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818}]}, "resp": [1], "when": "2021-10-25 10:27:23.152584", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Creating v3io client
-v3io_client = v3io.dataplane.Client()
-
-# Pushing some undrifted data to the input stream
-response = v3io_client.stream.put_records(container=container,
-                                          stream_path=base_input_stream, 
-                                          records=records[4900:5100])
-
-
-
-
-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_input_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from input stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_input_stream,
-                                          shard_id=0, location=location)
-# Showing the last sequence that is written to the input stream
-json.loads(response.body)['Records'][-1]
-
-
-
-
-
{'SequenceNumber': 200,
- 'Data': 'eyJjbGFzcyI6IDAuMCwgInJlcXVlc3QiOiB7Imluc3RhbmNlcyI6IFt7IlgxIjogMC4wLCAiWDIiOiAwLjAsICJYMyI6IDAuMzMzMTYzNjk4OSwgIlg0IjogMC40MjE2NzY1Njg3fV19LCAicmVzcCI6IFsxXSwgIndoZW4iOiAiMjAyMS0xMC0yNSAxMDoyNzoyMy4yOTM3OTgiLCAibW9kZWwiOiAic2tsZWFybi5lbnNlbWJsZS5SYW5kb21Gb3Jlc3RDbGFzc2lmaWVyIn0=',
- 'ArrivalTimeSec': 1635157644,
- 'ArrivalTimeNSec': 395309631}
-
-
-
-
-
-

Make sure some time has passed - the function needs to be triggered by the input stream, then it’ll write to the output stream#

-
-
-
# Getting earliest location in the shard
-location = json.loads(v3io_client.stream.seek(container=container,
-                                              stream_path=base_output_stream,
-                                              shard_id=0,
-                                              seek_type='EARLIEST').body)['Location']
-# Getting records from output stream
-response = v3io_client.stream.get_records(container=container,
-                                          stream_path=base_output_stream,
-                                          shard_id=0, location=location)
-
-
-
-
-
-
-
# Showing changed detected
-import base64
-for instance in json.loads(response.body)['Records']:
-    seq = instance["SequenceNumber"]
-    data = json.loads(base64.b64decode(instance['Data']))
-    if(data['ddm_drift']==1 or data['eddm_drift']==1):
-        print(f'sequence number : {seq}, data : {data}')
-
-
-
-
-
sequence number : 106, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.9628473804, 'X4': 0.5792453402}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291145', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 1, 'eddm_warning_zone': 0, 'eddm_drift': 0}
-sequence number : 122, data : {'class': 0.0, 'request': {'instances': [{'X1': 0.0, 'X2': 0.0, 'X3': 0.4969765505, 'X4': 0.9784738351}]}, 'resp': [1], 'when': '2021-10-25 10:27:23.291558', 'model': 'sklearn.ensemble.RandomForestClassifier', 'ddm_warning_zone': 0, 'ddm_drift': 0, 'eddm_warning_zone': 0, 'eddm_drift': 1}
-
-
-
-
-

We can see that the system detected a change in the 106 instance, which is 10006 instance in the real dataset -
-5000 first instances are for train, we started pushing data from the 4900 instance of the test dataset (9900 from the real dataset), and we pushed only 200 instances.

-

Back to the top

-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift/latest/static/function.html b/functions/development/concept_drift/latest/static/function.html deleted file mode 100644 index f9afe688..00000000 --- a/functions/development/concept_drift/latest/static/function.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: concept-drift
-  tag: ''
-  hash: 935da41196802875e19948974f32b6f00c29feb2
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - model-serving
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: concept_drift_deployer
-  entry_points:
-    concept_drift_deployer:
-      name: concept_drift_deployer
-      doc: "Deploy a streaming Concept Drift detector on a labeled stream\n   This\
-        \ function is the Deployment step for the Streaming Concept Drift Detector.\n\
-        \   It will load the selected drift detectors and initialize them with the\n\
-        \   base_dataset's statistics.  Then it will deploy the concept_drift_streaming\n\
-        \   function and pass the models to it for streaming concept-drift detection\
-        \ on top\n   of a labeled stream."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: base_dataset
-        type: DataItem
-        doc: Dataset containing label_col and prediction_col to initialize the detectors
-        default: ''
-      - name: input_stream
-        type: str
-        doc: labeled stream to track. Should contain label_col and prediction_col
-        default: ''
-      - name: consumer_group
-        type: str
-        default: ''
-      - name: output_stream
-        type: str
-        doc: Output stream to push the detector's alerts
-        default: ''
-      - name: output_tsdb
-        type: str
-        doc: Output TSDB table to allow analysis and display
-        default: ''
-      - name: tsdb_batch_size
-        type: int
-        doc: Batch size of alerts to buffer before pushing to the TSDB
-        default: ''
-      - name: callbacks
-        type: list
-        doc: Additional rest endpoints to send the alert data to
-        default: ''
-      - name: models
-        type: list
-        doc: List of the detectors to deploy Defaults to ['ddm', 'eddm', 'pagehinkley'].
-        default:
-        - ddm
-        - eddm
-        - pagehinkley
-      - name: models_dest
-        doc: Location for saving the detectors Defaults to 'models' (in relation to
-          artifact_path).
-        default: models
-      - name: pagehinkley_threshold
-        type: float
-        doc: Drift level threshold for PH detector Defaults to 10.
-        default: 10
-      - name: ddm_warning_level
-        type: float
-        doc: Warning level alert for DDM detector Defaults to 2.
-        default: 2
-      - name: ddm_out_control_level
-        type: float
-        doc: Drift level alert for DDM detector Defaults to 3.
-        default: 3
-      - name: label_col
-        doc: Label column to be used on base_dataset and input_stream Defaults to
-          'label'.
-        default: label
-      - name: prediction_col
-        doc: Prediction column to be used on base_dataset and input_stream Defaults
-          to 'prediction'.
-        default: prediction
-      - name: hub_url
-        type: str
-        doc: hub_url in case the default is not used, concept_drift_streaming will
-          be loaded by this url Defaults to mlconf.hub_url.
-        default: <_ast.Name object at 0x7f48eda946d0>
-      - name: fn_tag
-        type: str
-        doc: hub tag to use Defaults to 'master'
-        default: master
-      outputs:
-      - default: ''
-      lineno: 15
-  description: Deploy a streaming Concept Drift detector on a labeled stream
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbiAgIyBXZSB3aWxsIGdyYWIgb3VyIFBILCBERE0sIEVERE0gYWxnb3JpdGhtcyBmcm9tIGhlcmUKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIG51Y2xpby50cmlnZ2VycyBpbXBvcnQgVjNJT1N0cmVhbVRyaWdnZXIKZnJvbSBtbHJ1biBpbXBvcnQgRGF0YUl0ZW0sIGltcG9ydF9mdW5jdGlvbiwgbWxjb25mLCBNTENsaWVudEN0eCwgbW91bnRfdjNpbwoKaW1wb3J0IHJhbmRvbQoKCmRlZiBjb25jZXB0X2RyaWZ0X2RlcGxveWVyKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBiYXNlX2RhdGFzZXQ6IERhdGFJdGVtLAogICAgaW5wdXRfc3RyZWFtOiBzdHIsCiAgICBjb25zdW1lcl9ncm91cDogc3RyLAogICAgb3V0cHV0X3N0cmVhbTogc3RyLAogICAgb3V0cHV0X3RzZGI6IHN0ciwKICAgIHRzZGJfYmF0Y2hfc2l6ZTogaW50LAogICAgY2FsbGJhY2tzOiBsaXN0LAogICAgbW9kZWxzOiBsaXN0ID0gWyJkZG0iLCAiZWRkbSIsICJwYWdlaGlua2xleSJdLAogICAgbW9kZWxzX2Rlc3Q9Im1vZGVscyIsCiAgICBwYWdlaGlua2xleV90aHJlc2hvbGQ6IGZsb2F0ID0gMTAsCiAgICBkZG1fd2FybmluZ19sZXZlbDogZmxvYXQgPSAyLAogICAgZGRtX291dF9jb250cm9sX2xldmVsOiBmbG9hdCA9IDMsCiAgICBsYWJlbF9jb2w9ImxhYmVsIiwKICAgIHByZWRpY3Rpb25fY29sPSJwcmVkaWN0aW9uIiwKICAgIGh1Yl91cmw6IHN0ciA9IG1sY29uZi5odWJfdXJsLAogICAgZm5fdGFnOiBzdHIgPSAibWFzdGVyIiwKKToKICAgICIiIkRlcGxveSBhIHN0cmVhbWluZyBDb25jZXB0IERyaWZ0IGRldGVjdG9yIG9uIGEgbGFiZWxlZCBzdHJlYW0KICAgICAgIFRoaXMgZnVuY3Rpb24gaXMgdGhlIERlcGxveW1lbnQgc3RlcCBmb3IgdGhlIFN0cmVhbWluZyBDb25jZXB0IERyaWZ0IERldGVjdG9yLgogICAgICAgSXQgd2lsbCBsb2FkIHRoZSBzZWxlY3RlZCBkcmlmdCBkZXRlY3RvcnMgYW5kIGluaXRpYWxpemUgdGhlbSB3aXRoIHRoZQogICAgICAgYmFzZV9kYXRhc2V0J3Mgc3RhdGlzdGljcy4gIFRoZW4gaXQgd2lsbCBkZXBsb3kgdGhlIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nCiAgICAgICBmdW5jdGlvbiBhbmQgcGFzcyB0aGUgbW9kZWxzIHRvIGl0IGZvciBzdHJlYW1pbmcgY29uY2VwdC1kcmlmdCBkZXRlY3Rpb24gb24gdG9wCiAgICAgICBvZiBhIGxhYmVsZWQgc3RyZWFtLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIE1MUnVuIGNvbnRleHQKICAgIDpwYXJhbSBiYXNlX2RhdGFzZXQ6ICAgIERhdGFzZXQgY29udGFpbmluZyBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sIHRvIGluaXRpYWxpemUgdGhlIGRldGVjdG9ycwogICA
gOnBhcmFtIGlucHV0X3N0cmVhbTogICAgbGFiZWxlZCBzdHJlYW0gdG8gdHJhY2suCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBTaG91bGQgY29udGFpbiBsYWJlbF9jb2wgYW5kIHByZWRpY3Rpb25fY29sCiAgICA6cGFyYW0gb3V0cHV0X3N0cmVhbTogICBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggdGhlIGRldGVjdG9yJ3MgYWxlcnRzCiAgICA6cGFyYW0gb3V0cHV0X3RzZGI6ICAgICBPdXRwdXQgVFNEQiB0YWJsZSB0byBhbGxvdyBhbmFseXNpcyBhbmQgZGlzcGxheQogICAgOnBhcmFtIHRzZGJfYmF0Y2hfc2l6ZTogQmF0Y2ggc2l6ZSBvZiBhbGVydHMgdG8gYnVmZmVyIGJlZm9yZSBwdXNoaW5nIHRvIHRoZSBUU0RCCiAgICA6cGFyYW0gY2FsbGJhY2tzOiAgICAgICBBZGRpdGlvbmFsIHJlc3QgZW5kcG9pbnRzIHRvIHNlbmQgdGhlIGFsZXJ0IGRhdGEgdG8KICAgIDpwYXJhbSBtb2RlbHM6ICAgICAgICAgIExpc3Qgb2YgdGhlIGRldGVjdG9ycyB0byBkZXBsb3kKICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvIFsnZGRtJywgJ2VkZG0nLCAncGFnZWhpbmtsZXknXS4KICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgIExvY2F0aW9uIGZvciBzYXZpbmcgdGhlIGRldGVjdG9ycwogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21vZGVscycgKGluIHJlbGF0aW9uIHRvIGFydGlmYWN0X3BhdGgpLgogICAgOnBhcmFtIHBhZ2VoaW5rbGV5X3RocmVzaG9sZDogIERyaWZ0IGxldmVsIHRocmVzaG9sZCBmb3IgUEggZGV0ZWN0b3IgRGVmYXVsdHMgdG8gMTAuCiAgICA6cGFyYW0gZGRtX3dhcm5pbmdfbGV2ZWw6ICAgICAgV2FybmluZyBsZXZlbCBhbGVydCBmb3IgRERNIGRldGVjdG9yIERlZmF1bHRzIHRvIDIuCiAgICA6cGFyYW0gZGRtX291dF9jb250cm9sX2xldmVsOiAgRHJpZnQgbGV2ZWwgYWxlcnQgZm9yIERETSBkZXRlY3RvciBEZWZhdWx0cyB0byAzLgogICAgOnBhcmFtIGxhYmVsX2NvbDogICAgICAgTGFiZWwgY29sdW1uIHRvIGJlIHVzZWQgb24gYmFzZV9kYXRhc2V0IGFuZCBpbnB1dF9zdHJlYW0KICAgICAgICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHRzIHRvICdsYWJlbCcuCiAgICA6cGFyYW0gcHJlZGljdGlvbl9jb2w6ICBQcmVkaWN0aW9uIGNvbHVtbiB0byBiZSB1c2VkIG9uIGJhc2VfZGF0YXNldCBhbmQgaW5wdXRfc3RyZWFtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0cyB0byAncHJlZGljdGlvbicuCiAgICA6cGFyYW0gaHViX3VybDogICAgICAgICBodWJfdXJsIGluIGNhc2UgdGhlIGRlZmF1bHQgaXMgbm90IHVzZWQsIGNvbmNlcHRfZHJpZnRfc3RyZWFtaW5nIHdpbGwgYmUgbG9hZGVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBieSB0aGlzIHVybAogICAgICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gbWxjb25mLmh1Yl91cmwuCiAgICA6cGFyYW0gZm5fdGFnOiAgICAgICAgICBodWIgdGFnIHRvIHVzZQogICA
gICAgICAgICAgICAgICAgICAgICAgICAgRGVmYXVsdHMgdG8gJ21hc3RlcicKICAgICIiIgoKICAgIG1sY29uZi5kYnBhdGggPSBtbGNvbmYuZGJwYXRoIG9yICJodHRwOi8vbWxydW4tYXBpOjgwODAiCiAgICBtbGNvbmYuaHViX3VybCA9IGh1Yl91cmwKICAgIGZuID0gaW1wb3J0X2Z1bmN0aW9uKHVybD1mImh1YjovL2NvbmNlcHRfZHJpZnRfc3RyZWFtaW5nOntmbl90YWd9IikKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJMb2FkaW5nIGJhc2UgZGF0YXNldCIpCiAgICBiYXNlX2RmID0gYmFzZV9kYXRhc2V0LmFzX2RmKCkKICAgIGVycm9yX3N0cmVhbSA9IG5wLndoZXJlKAogICAgICAgIGJhc2VfZGZbcHJlZGljdGlvbl9jb2xdLnZhbHVlcyA9PSBiYXNlX2RmW2xhYmVsX2NvbF0udmFsdWVzLCAwLCAxCiAgICApCgogICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ3JlYXRpbmcgbW9kZWxzIikKICAgIG1vZGVscyA9IFsKICAgICAgICBtb2RlbC5zdHJpcCgpCiAgICAgICAgZm9yIG1vZGVsIGluIG9zLmdldGVudigibW9kZWxzIiwgInBhZ2VoaW5rbGV5LCBkZG0sIGVkZG0iKS5zcGxpdCgiLCIpCiAgICBdCiAgICBtb2RlbHMgPSB7CiAgICAgICAgImVkZG0iOiBza211bHRpZmxvdy5kcmlmdF9kZXRlY3Rpb24uRURETSgpLAogICAgICAgICJwYWdlaGlua2xleSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5QYWdlSGlua2xleSgKICAgICAgICAgICAgbWluX2luc3RhbmNlcz1sZW4oZXJyb3Jfc3RyZWFtKSwgdGhyZXNob2xkPXBhZ2VoaW5rbGV5X3RocmVzaG9sZAogICAgICAgICksCiAgICAgICAgImRkbSI6IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbi5ERE0oCiAgICAgICAgICAgIG1pbl9udW1faW5zdGFuY2VzPWxlbihlcnJvcl9zdHJlYW0pLAogICAgICAgICAgICB3YXJuaW5nX2xldmVsPWRkbV93YXJuaW5nX2xldmVsLAogICAgICAgICAgICBvdXRfY29udHJvbF9sZXZlbD1kZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgKSwKICAgIH0KCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdHJlYW1pbmcgZGF0YSB0byBtb2RlbHMiKQogICAgZm9yIGkgaW4gcmFuZ2UobGVuKGVycm9yX3N0cmVhbSkpOgogICAgICAgIGZvciBtb2RlbF9uYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICAgICAgbW9kZWwuYWRkX2VsZW1lbnQoZXJyb3Jfc3RyZWFtW2ldKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvZ2dpbmcgcmVhZHkgbW9kZWxzIikKICAgIGZvciBuYW1lLCBtb2RlbCBpbiBtb2RlbHMuaXRlbXMoKToKICAgICAgICBkYXRhID0gZHVtcHMobW9kZWwpCiAgICAgICAgbW9kZWxfZmlsZSA9IGYie25hbWV9LnBrbCIKICAgICAgICBjb250ZXh0LmxvZ19tb2RlbCgKICAgICAgICAgICAgZiJ7bmFtZX1fY29uY2VwdF9kcmlmdCIsCiAgICAgICAgICAgIGJvZHk9ZGF0YSwKICAgICAgICAgICAgbGFiZWxzPXsiZnJhbWV3b3JrIjogInNrbXVsdGlmbG93IiwgIndvcmtmbG93IjogImNvbmNlcHQ
tZHJpZnQifSwKICAgICAgICAgICAgbW9kZWxfZmlsZT1tb2RlbF9maWxlLAogICAgICAgICAgICBtb2RlbF9kaXI9bW9kZWxzX2Rlc3QsCiAgICAgICAgICAgIHRhZz0ibGF0ZXN0IiwKICAgICAgICApCiAgICAgICAgZm4uc2V0X2VudnMoCiAgICAgICAgICAgIHsKICAgICAgICAgICAgICAgIGYie25hbWV9X21vZGVsX3BhdGgiOiBvcy5wYXRoLmpvaW4oCiAgICAgICAgICAgICAgICAgICAgY29udGV4dC5hcnRpZmFjdF9wYXRoLCBtb2RlbHNfZGVzdCwgbW9kZWxfZmlsZQogICAgICAgICAgICAgICAgKQogICAgICAgICAgICB9CiAgICAgICAgKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkRlcGxveWluZyBDb25jZXB0IERyaWZ0IFN0cmVhbWluZyBmdW5jdGlvbiIpCiAgICBmbi5zZXRfZW52cygKICAgICAgICB7CiAgICAgICAgICAgICJsYWJlbF9jb2wiOiBsYWJlbF9jb2wsCiAgICAgICAgICAgICJwcmVkaWN0aW9uX2NvbCI6IHByZWRpY3Rpb25fY29sLAogICAgICAgICAgICAiZHJpZnRfc3RyZWFtIjogb3V0cHV0X3N0cmVhbSwKICAgICAgICAgICAgInRzZGJfdGFibGUiOiBvdXRwdXRfdHNkYiwKICAgICAgICAgICAgInBhZ2VoaW5rbGV5X3RocmVzaG9sZCI6IHBhZ2VoaW5rbGV5X3RocmVzaG9sZCwKICAgICAgICAgICAgImRkbV93YXJuaW5nX2xldmVsIjogZGRtX3dhcm5pbmdfbGV2ZWwsCiAgICAgICAgICAgICJkZG1fb3V0X2NvbnRyb2wiOiBkZG1fb3V0X2NvbnRyb2xfbGV2ZWwsCiAgICAgICAgfQogICAgKQogICAgZm4uYWRkX3YzaW9fc3RyZWFtX3RyaWdnZXIoc3RyZWFtX3BhdGggPSBpbnB1dF9zdHJlYW0sIG5hbWUgPSAnc3RyZWFtJywgZ3JvdXAgPSBjb25zdW1lcl9ncm91cCkKICAgIGZuLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgIGZuLmRlcGxveShwcm9qZWN0PWNvbnRleHQucHJvamVjdCkK
-    commands:
-    - python -m pip install scikit-multiflow
-    code_origin: https://github.com/daniels290813/functions.git#82bbfde4afa2eae77059e05c70bbebacf530fd0d:/User/test/functions/concept_drift/concept_drift.py
-    origin_filename: /User/test/functions/concept_drift/concept_drift.py
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/latest/static/item.html b/functions/development/concept_drift/latest/static/item.html deleted file mode 100644 index d61d5687..00000000 --- a/functions/development/concept_drift/latest/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-serving
-description: Deploy a streaming Concept Drift detector on a labeled stream
-doc: ''
-example: concept_drift.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: concept-drift
-platformVersion: 3.5.0
-spec:
-  filename: concept_drift.py
-  handler: concept_drift_deployer
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-multiflow
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift/latest/static/source.html b/functions/development/concept_drift/latest/static/source.html deleted file mode 100644 index 6aeba7a7..00000000 --- a/functions/development/concept_drift/latest/static/source.html +++ /dev/null @@ -1,169 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection  # We will grab our PH, DDM, EDDM algorithms from here
-import numpy as np
-import pandas as pd
-import os
-from cloudpickle import dumps, load, dump
-
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io
-
-import random
-
-
-def concept_drift_deployer(
-    context: MLClientCtx,
-    base_dataset: DataItem,
-    input_stream: str,
-    consumer_group: str,
-    output_stream: str,
-    output_tsdb: str,
-    tsdb_batch_size: int,
-    callbacks: list,
-    models: list = ["ddm", "eddm", "pagehinkley"],
-    models_dest="models",
-    pagehinkley_threshold: float = 10,
-    ddm_warning_level: float = 2,
-    ddm_out_control_level: float = 3,
-    label_col="label",
-    prediction_col="prediction",
-    hub_url: str = mlconf.hub_url,
-    fn_tag: str = "master",
-):
-    """Deploy a streaming Concept Drift detector on a labeled stream
-       This function is the Deployment step for the Streaming Concept Drift Detector.
-       It will load the selected drift detectors and initialize them with the
-       base_dataset's statistics.  Then it will deploy the concept_drift_streaming
-       function and pass the models to it for streaming concept-drift detection on top
-       of a labeled stream.
-
-    :param context:         MLRun context
-    :param base_dataset:    Dataset containing label_col and prediction_col to initialize the detectors
-    :param input_stream:    labeled stream to track.
-                            Should contain label_col and prediction_col
-    :param output_stream:   Output stream to push the detector's alerts
-    :param output_tsdb:     Output TSDB table to allow analysis and display
-    :param tsdb_batch_size: Batch size of alerts to buffer before pushing to the TSDB
-    :param callbacks:       Additional rest endpoints to send the alert data to
-    :param models:          List of the detectors to deploy
-                            Defaults to ['ddm', 'eddm', 'pagehinkley'].
-    :param models_dest:     Location for saving the detectors
-                            Defaults to 'models' (in relation to artifact_path).
-    :param pagehinkley_threshold:  Drift level threshold for PH detector Defaults to 10.
-    :param ddm_warning_level:      Warning level alert for DDM detector Defaults to 2.
-    :param ddm_out_control_level:  Drift level alert for DDM detector Defaults to 3.
-    :param label_col:       Label column to be used on base_dataset and input_stream
-                            Defaults to 'label'.
-    :param prediction_col:  Prediction column to be used on base_dataset and input_stream
-                            Defaults to 'prediction'.
-    :param hub_url:         hub_url in case the default is not used, concept_drift_streaming will be loaded
-                            by this url
-                            Defaults to mlconf.hub_url.
-    :param fn_tag:          hub tag to use
-                            Defaults to 'master'
-    """
-
-    mlconf.dbpath = mlconf.dbpath or "http://mlrun-api:8080"
-    mlconf.hub_url = hub_url
-    fn = import_function(url=f"hub://concept_drift_streaming:{fn_tag}")
-
-    context.logger.info("Loading base dataset")
-    base_df = base_dataset.as_df()
-    error_stream = np.where(
-        base_df[prediction_col].values == base_df[label_col].values, 0, 1
-    )
-
-    context.logger.info("Creating models")
-    models = [
-        model.strip()
-        for model in os.getenv("models", "pagehinkley, ddm, eddm").split(",")
-    ]
-    models = {
-        "eddm": skmultiflow.drift_detection.EDDM(),
-        "pagehinkley": skmultiflow.drift_detection.PageHinkley(
-            min_instances=len(error_stream), threshold=pagehinkley_threshold
-        ),
-        "ddm": skmultiflow.drift_detection.DDM(
-            min_num_instances=len(error_stream),
-            warning_level=ddm_warning_level,
-            out_control_level=ddm_out_control_level,
-        ),
-    }
-
-    context.logger.info("Streaming data to models")
-    for i in range(len(error_stream)):
-        for model_name, model in models.items():
-            model.add_element(error_stream[i])
-
-    context.logger.info("Logging ready models")
-    for name, model in models.items():
-        data = dumps(model)
-        model_file = f"{name}.pkl"
-        context.log_model(
-            f"{name}_concept_drift",
-            body=data,
-            labels={"framework": "skmultiflow", "workflow": "concept-drift"},
-            model_file=model_file,
-            model_dir=models_dest,
-            tag="latest",
-        )
-        fn.set_envs(
-            {
-                f"{name}_model_path": os.path.join(
-                    context.artifact_path, models_dest, model_file
-                )
-            }
-        )
-
-    context.logger.info("Deploying Concept Drift Streaming function")
-    fn.set_envs(
-        {
-            "label_col": label_col,
-            "prediction_col": prediction_col,
-            "drift_stream": output_stream,
-            "tsdb_table": output_tsdb,
-            "pagehinkley_threshold": pagehinkley_threshold,
-            "ddm_warning_level": ddm_warning_level,
-            "ddm_out_control": ddm_out_control_level,
-        }
-    )
-    fn.add_v3io_stream_trigger(stream_path = input_stream, name = 'stream', group = consumer_group)
-    fn.apply(mount_v3io())
-    fn.deploy(project=context.project)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.1/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/0.0.1/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/0.0.1/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/0.0.1/src/concept_drift_streaming.py deleted file mode 100644 index 830c2d50..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/src/concept_drift_streaming.py +++ /dev/null @@ -1,143 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = 
[{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res["prediction"] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", 
os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/0.0.1/src/function.yaml b/functions/development/concept_drift_streaming/0.0.1/src/function.yaml deleted file mode 100644 index 06704274..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/src/function.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: d97012d8a20a9042c2f13dd790ecb28e34d9f29f - project: default - 
labels: - author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSk
KCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzWyJwcmVkaWN0aW9uIl0gPSByZXNbInJlc3AiXVswXQogICAgcmV0dXJuIHJlcwoKCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICB2M2lvX2NsaWVudCA9IHYzaW8uZGF0YXBsYW5lLkNsaWVudCgpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2lvX2NsaWVudCIsIHYzaW9fY2xpZW50KQoKICAgIHYzZl9jbGllbnQgPSB2M2YuQ2xpZW50KCJmcmFtZXNkOjgwODEiLCBjb250YWluZXI9ImJpZ2RhdGEiKQogICAgc2V0YXR0cihjb250ZXh0LCAidjNmIiwgdjNmX2NsaWVudCkKICAgIHdpbmRvdyA9IFtdCiAgICBzZXRhdHRyKGNvbnRleHQsICJ3aW5kb3ciLCB3aW5kb3cpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ3aW5kb3dfc2l6ZSIsIGludChvcy5nZXRlbnYoIndpbmRvd19zaXplIiwgMTApKSkKICAgIHNldGF0dHIoY29udGV4dCwgInRzZGJfdGFibGUiLCBvcy5nZXRlbnYoInRzZGJfdGFibGUiLCAiY29uY2VwdF9kcmlmdF90c2RiXzEiKSkKICAgIHRyeToKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIHJhdGU9IjEvcyIsIGlmX2V4aXN0cz0xKQogICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBjb250ZXh0IHdpdGggcmF0ZT0gZmFpbGUgZm9yIHtlfSIpCiAgICAgICAgY29udGV4dC52M2YuY3JlYXRlKAogICAgICAgICAgICAidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgYXR0cnM9eyJyYXRlIjogIjEvcyJ9LCBpZl9leGlzdHM9MQogICAgICAgICkKCiAgICBjYWxsYmFja3MgPSBbY2FsbGJhY2suc3RyaXAoKSBmb3IgY2FsbGJhY2sgaW4gb3MuZ2V0ZW52KCJjYWxsYmFja3MiLCAiIikuc3BsaXQoIiwiKV0KICAgIHN
ldGF0dHIoY29udGV4dCwgImNhbGxiYWNrcyIsIGNhbGxiYWNrcykKCiAgICBzZXRhdHRyKGNvbnRleHQsICJkcmlmdF9zdHJlYW0iLCBvcy5nZXRlbnYoImRyaWZ0X3N0cmVhbSIsICIvYmlnZGF0YS9kcmlmdF9zdHJlYW0iKSkKICAgIHRyeToKICAgICAgICBjcmVhdGVfc3RyZWFtKAogICAgICAgICAgICBjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgaW50KG9zLmdldGVudigiZHJpZnRfc3RyZWFtX3NoYXJkcyIsIDEpKQogICAgICAgICkKICAgIGV4Y2VwdDoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie2NvbnRleHQuZHJpZnRfc3RyZWFtfSBhbHJlYWR5IGV4aXN0cyIpCgogICAgbW9kZWxzID0ge30KICAgIG1vZGVsX3R5cGVzID0gWyJwYWdlaGlua2VseSIsICJkZG0iLCAiZWRkbSJdCiAgICBwYXRoX3N1ZmZpeCA9ICJfbW9kZWxfcGF0aCIKICAgIGZvciBtb2RlbCBpbiBtb2RlbF90eXBlczoKICAgICAgICBtb2RlbF9lbnYgPSBmInttb2RlbH17cGF0aF9zdWZmaXh9IgogICAgICAgIGlmIG1vZGVsX2VudiBpbiBvcy5lbnZpcm9uOgogICAgICAgICAgICB3aXRoIG9wZW4ob3MuZW52aXJvblttb2RlbF9lbnZdLCAicmIiKSBhcyBmOgogICAgICAgICAgICAgICAgbW9kZWxzW21vZGVsXSA9IGxvYWQoZikKICAgIHNldGF0dHIoY29udGV4dCwgIm1vZGVscyIsIG1vZGVscykKCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpKQogICAgc2V0YXR0cihjb250ZXh0LCAicHJlZGljdGlvbl9jb2wiLCBvcy5nZXRlbnYoInByZWRpY3Rpb25fY29sIiwgInByZWRpY3Rpb24iKSkKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiZXZlbnQ6IHtldmVudC5ib2R5fSIpCiAgICBmdWxsX2V2ZW50ID0ganNvbi5sb2FkcyhldmVudC5ib2R5KQogICAgcmVjb3JkID0gY29uc3RydWN0X3JlY29yZChmdWxsX2V2ZW50KQoKICAgIGlzX2Vycm9yID0gcmVjb3JkW2NvbnRleHQubGFiZWxfY29sXSAhPSByZWNvcmRbY29udGV4dC5wcmVkaWN0aW9uX2NvbF0KICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2lzX2Vycm9yfSIpCgogICAgZm9yIG5hbWUsIG1vZGVsIGluIGNvbnRleHQubW9kZWxzLml0ZW1zKCk6CiAgICAgICAgcmVzdWx0cyA9IHsidGltZXN0YW1wIjogcmVjb3JkWyJ0aW1lc3RhbXAiXX0KICAgICAgICByZXN1bHRzWyJhbGdvcml0aG0iXSA9IG5hbWUKICAgICAgICBtb2RlbC5hZGRfZWxlbWVudChpc19lcnJvcikKCiAgICAgICAgaWYgaGFzYXR0cihtb2RlbCwgImRldGVjdGVkX3dhcm5pbmdfem9uZSIpIGFuZCBtb2RlbC5kZXRlY3RlZF93YXJuaW5nX3pvbmUoKToKICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIntuYW1lfVx0V2FybmluZyB6b25lIGRldGVjdGVkIikKICAgICAgICAgICAgcmVzdWx0c1sid2FybmluZ196b25lIl0gPSAxCiAgICAgICAgICAgIGZ1bGxfZXZ
lbnRbZiJ7bmFtZX1fd2FybmluZ196b25lIl0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1sid2FybmluZ196b25lIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fd2FybmluZ196b25lIl0gPSAwCgogICAgICAgIGlmIG1vZGVsLmRldGVjdGVkX2NoYW5nZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJDaGFuZ2UgRGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJjaGFuZ2VfZGV0ZWN0ZWQiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV9kcmlmdCJdID0gMQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMAogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAwCiAgICAgICAgY29udGV4dC53aW5kb3cuYXBwZW5kKHJlc3VsdHMpCgogICAgcHVzaF90b19zdHJlYW0oY29udGV4dCwgY29udGV4dC5kcmlmdF9zdHJlYW0sIFtmdWxsX2V2ZW50XSkKCiAgICBpZiBjb250ZXh0LmNhbGxiYWNrcyAhPSBbIiJdOgogICAgICAgIGZvciBjYWxsYmFjayBpbiBjb250ZXh0LmNhbGxiYWNrczoKICAgICAgICAgICAgcmVxdWVzdHMucG9zdCh1cmw9Y2FsbGJhY2ssIGpzb249ZnVsbF9ldmVudCkKCiAgICBpZiAobGVuKGNvbnRleHQud2luZG93KSAvIGxlbihjb250ZXh0Lm1vZGVscykpID49IGNvbnRleHQud2luZG93X3NpemU6CiAgICAgICAgZGYgPSBwZC5EYXRhRnJhbWUoY29udGV4dC53aW5kb3cpCiAgICAgICAgZGZbInRpbWVzdGFtcCJdID0gcGQudG9fZGF0ZXRpbWUoZGZbInRpbWVzdGFtcCJdKQogICAgICAgIGRmID0gZGYuc2V0X2luZGV4KFsidGltZXN0YW1wIiwgImFsZ29yaXRobSJdKQogICAgICAgIGNvbnRleHQudjNmLndyaXRlKCJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBkZikKICAgICAgICBjb250ZXh0LndpbmRvdyA9IFtdCg== - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/0.0.1/src/item.yaml b/functions/development/concept_drift_streaming/0.0.1/src/item.yaml deleted file mode 100644 index 1ab5c700..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring 
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2021-05-19:22-41 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: concept-drift-streaming -platformVersion: '' -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 0.0.1 diff --git a/functions/development/concept_drift_streaming/0.0.1/static/documentation.html b/functions/development/concept_drift_streaming/0.0.1/static/documentation.html deleted file mode 100644 index fd1cd3dd..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift_streaming package

-
-

Submodules

-
-
-

concept_drift_streaming.concept_drift_streaming module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.1/static/example.html b/functions/development/concept_drift_streaming/0.0.1/static/example.html deleted file mode 100644 index abf68816..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/static/example.html +++ /dev/null @@ -1,486 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Concept Drift Streaming

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.1/static/function.html b/functions/development/concept_drift_streaming/0.0.1/static/function.html deleted file mode 100644 index 70fb96a6..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/static/function.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: d97012d8a20a9042c2f13dd790ecb28e34d9f29f
-  project: default
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzWyJwcmVkaWN0aW9uIl0gPSByZXNbInJlc3AiXVswXQogICAgcmV0dXJuIHJlcwoKCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICB2M2lvX2NsaWVudCA9IHYzaW8uZGF0YXBsYW5lLkNsaWVudCgpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2lvX2NsaWVudCIsIHYzaW9fY2xpZW50KQoKICAgIHYzZl9jbGllbnQgPSB2M2YuQ2xpZW50KCJmcmFtZXNkOjgwODEiLCBjb250YWluZXI9ImJpZ2RhdGEiKQogICAgc2V0YXR0cihjb250ZXh0LCAidjNmIiwgdjNmX2NsaWVudCkKICAgIHdpbmRvdyA9IFtdCiAgICBzZXRhdHRyKGNvbnRleHQsICJ3aW5kb3ciLCB3aW5kb3cpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ3aW5kb3dfc2l6ZSIsIGludChvcy5nZXRlbnYoIndpbmRvd19zaXplIiwgMTApKSkKICAgIHNldGF0dHIoY29udGV4dCwgInRzZGJfdGFibGUiLCBvcy5nZXRlbnYoInRzZGJfdGFibGUiLCAiY29uY2VwdF9kcmlmdF90c2RiXzEiKSkKICAgIHRyeToKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIHJhdGU9IjEvcyIsIGlmX2V4aXN0cz0xKQogICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBjb250ZXh0IHdpdGggcmF0ZT0gZmFpbGUgZm9yIHtlfSIpCiAgICAgICAgY29udGV4dC52M2YuY3JlYXRlKAogICAgICAgICAgICAidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgYXR0cnM9eyJyYXRlIjogIjEvcyJ9LCBpZl9leGlzdHM9MQogICAgICAgICkKCiAgICBjYWxsYmFja3MgPSBbY2FsbGJhY2suc3RyaXAoKSBmb3IgY2FsbGJhY2sgaW4gb3MuZ2V0ZW52KCJjYWxsYmFja3MiLCAiIikuc3BsaXQoIiwiKV0KICAgIHNldGF0dHIoY29udGV4dCwgImNhbGxiYWNrcyIsIGNhbGxiYWNrcykKCiAgICBzZXRhdHRyKGNvbnRleHQsICJkcmlmdF9zdHJlYW0iLCBvcy5nZXRlbnYoImRyaWZ0X3N0cmVhbSIsICIvYmlnZGF0YS9kcmlmdF9zdHJlYW0iKSkKICAgIHRyeToKICAgICAgICBjcmVhdGVfc3RyZWFtKAogICAgICAgICAgICBjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgaW50KG9zLmdldGVudigiZHJpZnRfc3RyZWFtX3NoYXJkcyIsIDEpKQogICAgICAgICkKICAgIGV4Y2VwdDoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie2NvbnRleHQuZHJpZnRfc3RyZWFtfSBhbHJlYWR5IGV4aXN0cyIpCgogICAgbW9kZWxzID0ge30KICAgIG1vZGVsX3R5cGVzID0gWyJwYWdlaGlua2VseSIsICJkZG0iLCAiZWRkbSJdCiAgICBwYXRoX3N1ZmZpeCA9ICJfbW9kZWxfcGF0aCIKICAgIGZvciBtb2RlbCBpbiBtb2RlbF90eXBlczoKICAgICAgICBtb2RlbF9lbnYgPSBmInttb2RlbH17cGF0aF9zdWZmaXh9IgogICAgICAgIGlmIG1vZGVsX2VudiBpbiB
vcy5lbnZpcm9uOgogICAgICAgICAgICB3aXRoIG9wZW4ob3MuZW52aXJvblttb2RlbF9lbnZdLCAicmIiKSBhcyBmOgogICAgICAgICAgICAgICAgbW9kZWxzW21vZGVsXSA9IGxvYWQoZikKICAgIHNldGF0dHIoY29udGV4dCwgIm1vZGVscyIsIG1vZGVscykKCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpKQogICAgc2V0YXR0cihjb250ZXh0LCAicHJlZGljdGlvbl9jb2wiLCBvcy5nZXRlbnYoInByZWRpY3Rpb25fY29sIiwgInByZWRpY3Rpb24iKSkKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiZXZlbnQ6IHtldmVudC5ib2R5fSIpCiAgICBmdWxsX2V2ZW50ID0ganNvbi5sb2FkcyhldmVudC5ib2R5KQogICAgcmVjb3JkID0gY29uc3RydWN0X3JlY29yZChmdWxsX2V2ZW50KQoKICAgIGlzX2Vycm9yID0gcmVjb3JkW2NvbnRleHQubGFiZWxfY29sXSAhPSByZWNvcmRbY29udGV4dC5wcmVkaWN0aW9uX2NvbF0KICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2lzX2Vycm9yfSIpCgogICAgZm9yIG5hbWUsIG1vZGVsIGluIGNvbnRleHQubW9kZWxzLml0ZW1zKCk6CiAgICAgICAgcmVzdWx0cyA9IHsidGltZXN0YW1wIjogcmVjb3JkWyJ0aW1lc3RhbXAiXX0KICAgICAgICByZXN1bHRzWyJhbGdvcml0aG0iXSA9IG5hbWUKICAgICAgICBtb2RlbC5hZGRfZWxlbWVudChpc19lcnJvcikKCiAgICAgICAgaWYgaGFzYXR0cihtb2RlbCwgImRldGVjdGVkX3dhcm5pbmdfem9uZSIpIGFuZCBtb2RlbC5kZXRlY3RlZF93YXJuaW5nX3pvbmUoKToKICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIntuYW1lfVx0V2FybmluZyB6b25lIGRldGVjdGVkIikKICAgICAgICAgICAgcmVzdWx0c1sid2FybmluZ196b25lIl0gPSAxCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fd2FybmluZ196b25lIl0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1sid2FybmluZ196b25lIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fd2FybmluZ196b25lIl0gPSAwCgogICAgICAgIGlmIG1vZGVsLmRldGVjdGVkX2NoYW5nZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJDaGFuZ2UgRGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJjaGFuZ2VfZGV0ZWN0ZWQiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV9kcmlmdCJdID0gMQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMAogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAwCiAgICAgICAgY29udGV4dC53aW5kb3cuYXBwZW5kKHJlc3VsdHMpCgogICAgcHVzaF90b19zdHJlYW0oY29udGV4dCwgY29udGV4dC5kcmlmdF9zdHJlYW0sIFtmdWxsX2V2ZW50XSkKCiAgICBpZiBjb250ZXh0LmN
hbGxiYWNrcyAhPSBbIiJdOgogICAgICAgIGZvciBjYWxsYmFjayBpbiBjb250ZXh0LmNhbGxiYWNrczoKICAgICAgICAgICAgcmVxdWVzdHMucG9zdCh1cmw9Y2FsbGJhY2ssIGpzb249ZnVsbF9ldmVudCkKCiAgICBpZiAobGVuKGNvbnRleHQud2luZG93KSAvIGxlbihjb250ZXh0Lm1vZGVscykpID49IGNvbnRleHQud2luZG93X3NpemU6CiAgICAgICAgZGYgPSBwZC5EYXRhRnJhbWUoY29udGV4dC53aW5kb3cpCiAgICAgICAgZGZbInRpbWVzdGFtcCJdID0gcGQudG9fZGF0ZXRpbWUoZGZbInRpbWVzdGFtcCJdKQogICAgICAgIGRmID0gZGYuc2V0X2luZGV4KFsidGltZXN0YW1wIiwgImFsZ29yaXRobSJdKQogICAgICAgIGNvbnRleHQudjNmLndyaXRlKCJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBkZikKICAgICAgICBjb250ZXh0LndpbmRvdyA9IFtdCg==
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.1/static/item.html b/functions/development/concept_drift_streaming/0.0.1/static/item.html deleted file mode 100644 index c6e2f0f8..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2021-05-19:22-41
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: concept-drift-streaming
-platformVersion: ''
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.1/static/source.html b/functions/development/concept_drift_streaming/0.0.1/static/source.html deleted file mode 100644 index 60fa5704..00000000 --- a/functions/development/concept_drift_streaming/0.0.1/static/source.html +++ /dev/null @@ -1,165 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res["prediction"] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.2/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/0.0.2/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/0.0.2/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/0.0.2/src/concept_drift_streaming.py deleted file mode 100644 index 94247c45..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/src/concept_drift_streaming.py +++ /dev/null @@ -1,143 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = 
[{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", 
os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/0.0.2/src/function.yaml b/functions/development/concept_drift_streaming/0.0.2/src/function.yaml deleted file mode 100644 index a91ac25f..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: dc41ff41149be69f19b91a6d78a06571937063ae - project: default - 
labels: - author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSk
KCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICA
gc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGluIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9
ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQuY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py - origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/0.0.2/src/item.yaml b/functions/development/concept_drift_streaming/0.0.2/src/item.yaml deleted file mode 100644 index 20e525ea..00000000 --- 
a/functions/development/concept_drift_streaming/0.0.2/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring -description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2021-05-19:22-41 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: concept-drift-streaming -platformVersion: '' -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 0.0.2 diff --git a/functions/development/concept_drift_streaming/0.0.2/static/documentation.html b/functions/development/concept_drift_streaming/0.0.2/static/documentation.html deleted file mode 100644 index a8b6c05c..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift_streaming package

-
-

Submodules

-
-
-

concept_drift_streaming.concept_drift_streaming module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.2/static/example.html b/functions/development/concept_drift_streaming/0.0.2/static/example.html deleted file mode 100644 index adc40ef3..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/static/example.html +++ /dev/null @@ -1,486 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Concept Drift Streaming

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.2/static/function.html b/functions/development/concept_drift_streaming/0.0.2/static/function.html deleted file mode 100644 index a69f4c73..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: dc41ff41149be69f19b91a6d78a06571937063ae
-  project: default
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGl
uIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQ
uY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.2/static/item.html b/functions/development/concept_drift_streaming/0.0.2/static/item.html deleted file mode 100644 index 5a723282..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2021-05-19:22-41
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: concept-drift-streaming
-platformVersion: ''
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 0.0.2
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.0.2/static/source.html b/functions/development/concept_drift_streaming/0.0.2/static/source.html deleted file mode 100644 index 3c479b8f..00000000 --- a/functions/development/concept_drift_streaming/0.0.2/static/source.html +++ /dev/null @@ -1,165 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res[prediction_col] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.8.0/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/0.8.0/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/0.8.0/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/0.8.0/src/concept_drift_streaming.py deleted file mode 100644 index 94247c45..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/src/concept_drift_streaming.py +++ /dev/null @@ -1,143 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = 
[{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", 
os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/0.8.0/src/function.yaml b/functions/development/concept_drift_streaming/0.8.0/src/function.yaml deleted file mode 100644 index a91ac25f..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: dc41ff41149be69f19b91a6d78a06571937063ae - project: default - 
labels: - author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSk
KCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICA
gc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGluIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9
ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQuY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py - origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/0.8.0/src/item.yaml b/functions/development/concept_drift_streaming/0.8.0/src/item.yaml deleted file mode 100644 index 7db4e071..00000000 --- 
a/functions/development/concept_drift_streaming/0.8.0/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring -description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio - part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2021-05-19:22-41 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: concept-drift-streaming -platformVersion: 3.2.0 -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 0.8.0 diff --git a/functions/development/concept_drift_streaming/0.8.0/static/documentation.html b/functions/development/concept_drift_streaming/0.8.0/static/documentation.html deleted file mode 100644 index fd1cd3dd..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift_streaming package

-
-

Submodules

-
-
-

concept_drift_streaming.concept_drift_streaming module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.8.0/static/example.html b/functions/development/concept_drift_streaming/0.8.0/static/example.html deleted file mode 100644 index 17b9b139..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/static/example.html +++ /dev/null @@ -1,486 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Concept Drift Streaming

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.8.0/static/function.html b/functions/development/concept_drift_streaming/0.8.0/static/function.html deleted file mode 100644 index a69f4c73..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: dc41ff41149be69f19b91a6d78a06571937063ae
-  project: default
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGl
uIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQ
uY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.8.0/static/item.html b/functions/development/concept_drift_streaming/0.8.0/static/item.html deleted file mode 100644 index e3e65ff7..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio
-  part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2021-05-19:22-41
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: concept-drift-streaming
-platformVersion: 3.2.0
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.8.0/static/source.html b/functions/development/concept_drift_streaming/0.8.0/static/source.html deleted file mode 100644 index 3c479b8f..00000000 --- a/functions/development/concept_drift_streaming/0.8.0/static/source.html +++ /dev/null @@ -1,165 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res[prediction_col] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.0/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/0.9.0/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/0.9.0/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/0.9.0/src/concept_drift_streaming.py deleted file mode 100644 index 94247c45..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/src/concept_drift_streaming.py +++ /dev/null @@ -1,143 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = 
[{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", 
os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/0.9.0/src/function.yaml b/functions/development/concept_drift_streaming/0.9.0/src/function.yaml deleted file mode 100644 index 3001b1bf..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: dc41ff41149be69f19b91a6d78a06571937063ae - project: '' - labels: 
- author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWY
gcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR
0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGluIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFt
mIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQuY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py - origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/0.9.0/src/item.yaml b/functions/development/concept_drift_streaming/0.9.0/src/item.yaml deleted file mode 100644 index 27c94397..00000000 --- 
a/functions/development/concept_drift_streaming/0.9.0/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring -description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio - part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: concept-drift-streaming -platformVersion: 3.2.0 -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 0.9.0 diff --git a/functions/development/concept_drift_streaming/0.9.0/static/documentation.html b/functions/development/concept_drift_streaming/0.9.0/static/documentation.html deleted file mode 100644 index fd1cd3dd..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift_streaming package

-
-

Submodules

-
-
-

concept_drift_streaming.concept_drift_streaming module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.0/static/example.html b/functions/development/concept_drift_streaming/0.9.0/static/example.html deleted file mode 100644 index 17b9b139..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/static/example.html +++ /dev/null @@ -1,486 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Concept Drift Streaming

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.0/static/function.html b/functions/development/concept_drift_streaming/0.9.0/static/function.html deleted file mode 100644 index 32c31c6f..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: dc41ff41149be69f19b91a6d78a06571937063ae
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGl
uIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQ
uY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.0/static/item.html b/functions/development/concept_drift_streaming/0.9.0/static/item.html deleted file mode 100644 index 845bf1c4..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio
-  part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: concept-drift-streaming
-platformVersion: 3.2.0
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.0/static/source.html b/functions/development/concept_drift_streaming/0.9.0/static/source.html deleted file mode 100644 index 3c479b8f..00000000 --- a/functions/development/concept_drift_streaming/0.9.0/static/source.html +++ /dev/null @@ -1,165 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res[prediction_col] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.1/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/0.9.1/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/0.9.1/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/0.9.1/src/concept_drift_streaming.py deleted file mode 100644 index 94247c45..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/src/concept_drift_streaming.py +++ /dev/null @@ -1,143 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = 
[{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", 
os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/0.9.1/src/function.yaml b/functions/development/concept_drift_streaming/0.9.1/src/function.yaml deleted file mode 100644 index 3001b1bf..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: dc41ff41149be69f19b91a6d78a06571937063ae - project: '' - labels: 
- author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWY
gcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR
0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGluIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFt
mIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQuY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py - origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/0.9.1/src/item.yaml b/functions/development/concept_drift_streaming/0.9.1/src/item.yaml deleted file mode 100644 index 3999259b..00000000 --- 
a/functions/development/concept_drift_streaming/0.9.1/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring -description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio - part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: concept-drift-streaming -platformVersion: 3.2.0 -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 0.9.1 diff --git a/functions/development/concept_drift_streaming/0.9.1/src/requirements.txt b/functions/development/concept_drift_streaming/0.9.1/src/requirements.txt deleted file mode 100644 index fa0fddd8..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -skmultiflow \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.1/static/documentation.html b/functions/development/concept_drift_streaming/0.9.1/static/documentation.html deleted file mode 100644 index c8d36c12..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/static/documentation.html +++ /dev/null @@ -1,152 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

concept_drift_streaming package

-
-

Submodules

-
-
-

concept_drift_streaming.concept_drift_streaming module

-
-
-concept_drift_streaming.concept_drift_streaming.construct_record(record)[source]
-
-
-
-concept_drift_streaming.concept_drift_streaming.create_stream(context, path, shards=1)[source]
-
-
-
-concept_drift_streaming.concept_drift_streaming.handler(context, event)[source]
-
-
-
-concept_drift_streaming.concept_drift_streaming.init_context(context)[source]
-
-
-
-concept_drift_streaming.concept_drift_streaming.push_to_stream(context, stream_path, data)[source]
-
-
-
-concept_drift_streaming.concept_drift_streaming.split_path(mntpath='')[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.1/static/example.html b/functions/development/concept_drift_streaming/0.9.1/static/example.html deleted file mode 100644 index 9f4aef2f..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/static/example.html +++ /dev/null @@ -1,489 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Concept Drift Streaming

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.1/static/function.html b/functions/development/concept_drift_streaming/0.9.1/static/function.html deleted file mode 100644 index 32c31c6f..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: dc41ff41149be69f19b91a6d78a06571937063ae
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGl
uIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQ
uY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.1/static/item.html b/functions/development/concept_drift_streaming/0.9.1/static/item.html deleted file mode 100644 index a09fc48b..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio
-  part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: concept-drift-streaming
-platformVersion: 3.2.0
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/0.9.1/static/source.html b/functions/development/concept_drift_streaming/0.9.1/static/source.html deleted file mode 100644 index 3c479b8f..00000000 --- a/functions/development/concept_drift_streaming/0.9.1/static/source.html +++ /dev/null @@ -1,165 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res[prediction_col] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/1.1.0/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/1.1.0/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/1.1.0/src/concept_drift_streaming.py deleted file mode 100644 index ebcbf8a1..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/src/concept_drift_streaming.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = [{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - 
context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 
- full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/1.1.0/src/function.yaml b/functions/development/concept_drift_streaming/1.1.0/src/function.yaml deleted file mode 100644 index 3001b1bf..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: dc41ff41149be69f19b91a6d78a06571937063ae - project: '' - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. 
the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9
uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICA
gICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGluIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmd
lKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQuY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py - origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/1.1.0/src/item.yaml b/functions/development/concept_drift_streaming/1.1.0/src/item.yaml deleted file mode 100644 index 91dcb9f4..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/src/item.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring -description: Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio - part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: concept-drift-streaming -platformVersion: 3.5.0 -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 1.1.0 diff --git a/functions/development/concept_drift_streaming/1.1.0/src/requirements.txt b/functions/development/concept_drift_streaming/1.1.0/src/requirements.txt deleted file mode 100644 index fa0fddd8..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -skmultiflow \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/static/concept_drift_streaming.html b/functions/development/concept_drift_streaming/1.1.0/static/concept_drift_streaming.html deleted file mode 100644 index 8885d1f6..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/static/concept_drift_streaming.html +++ /dev/null @@ -1,297 +0,0 @@ - - - - - - - -concept_drift_streaming.concept_drift_streaming - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for concept_drift_streaming.concept_drift_streaming

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-
[docs]def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath
- - -
[docs]def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204])
- - -
[docs]def push_to_stream(context, stream_path, data): - records = [{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - )
- - -
[docs]def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res
- - -
[docs]def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
- - -
[docs]def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = []
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/static/documentation.html b/functions/development/concept_drift_streaming/1.1.0/static/documentation.html deleted file mode 100644 index 3698c0c7..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/static/documentation.html +++ /dev/null @@ -1,246 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

concept_drift_streaming package

- -
- -
-
-
-
-
-

concept_drift_streaming package#

-
-

Submodules#

-
-
-

concept_drift_streaming.concept_drift_streaming module#

-
-
-concept_drift_streaming.concept_drift_streaming.construct_record(record)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.create_stream(context, path, shards=1)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.handler(context, event)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.init_context(context)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.push_to_stream(context, stream_path, data)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.split_path(mntpath='')[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/static/example.html b/functions/development/concept_drift_streaming/1.1.0/static/example.html deleted file mode 100644 index e3e865b8..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/static/example.html +++ /dev/null @@ -1,598 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- -
-
- Contents -
- -
-
-
-
- -
-

Concept Drift Streaming

- -
-
-
-

Contents

-
- -
-
-
-
-
-
-

Concept Drift Streaming#

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test#

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster#

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml#

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing#

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/static/function.html b/functions/development/concept_drift_streaming/1.1.0/static/function.html deleted file mode 100644 index 32c31c6f..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: dc41ff41149be69f19b91a6d78a06571937063ae
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGl
uIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQ
uY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/static/item.html b/functions/development/concept_drift_streaming/1.1.0/static/item.html deleted file mode 100644 index ad0dc6ee..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/static/item.html +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio
-  part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: concept-drift-streaming
-platformVersion: 3.5.0
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/1.1.0/static/source.html b/functions/development/concept_drift_streaming/1.1.0/static/source.html deleted file mode 100644 index 2524ecf3..00000000 --- a/functions/development/concept_drift_streaming/1.1.0/static/source.html +++ /dev/null @@ -1,179 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res[prediction_col] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/src/concept_drift_streaming.ipynb b/functions/development/concept_drift_streaming/latest/src/concept_drift_streaming.ipynb deleted file mode 100644 index b916cb7a..00000000 --- a/functions/development/concept_drift_streaming/latest/src/concept_drift_streaming.ipynb +++ /dev/null @@ -1,480 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Concept Drift Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install scikit-multiflow==0.4.1\n", - "python -m pip install v3io_frames" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"\n", - "\n", - "# Add V3IO Mount\n", - "# %nuclio env %v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "env = {'label_col': 'resp',\n", - " 'prediction_col': 'prediction',\n", - " 'drift_stream': '/bigdata/network-operations/drift_stream',\n", - " 'tsdb_table': 'network-operations/drift_tsdb',\n", - " 'pagehinkley_threshold': 10,\n", - " 'models': ['pagehinkley', 'ddm', 'eddm'],\n", - " 'window_size': 10}\n", - "config = {'kind': 'nuclio',\n", - " 'spec.build.baseImage': 
'mlrun/ml-models'}\n", - "cmd = ['python -m pip install scikit-multiflow',\n", - " 'python -m pip install v3io_frames']\n", - "v3io = True\n", - "config = nuclio.ConfigSpec(env=env,\n", - " config=config,\n", - " cmd=cmd,\n", - " v3io=v3io)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import skmultiflow.drift_detection\n", - "import numpy as np\n", - "import pandas as pd\n", - "import os\n", - "import json\n", - "import v3io.dataplane\n", - "import v3io_frames as v3f\n", - "import requests\n", - "from cloudpickle import load\n", - "\n", - "# For testing\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def split_path(mntpath=''):\n", - " if mntpath[0] == '/':\n", - " mntpath = mntpath[1:]\n", - " paths = mntpath.split('/')\n", - " container = paths[0]\n", - " subpath = ''\n", - " if len(paths) > 1:\n", - " subpath = mntpath[len(container):]\n", - " return container, subpath\n", - "\n", - "\n", - "def create_stream(context, path, shards=1):\n", - " # create a stream w/8 shards\n", - " container, stream_path = split_path(path)\n", - " context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')\n", - " response = context.v3io_client.create_stream(container=container,\n", - " path=stream_path, \n", - " shard_count=shards,\n", - " raise_for_status=v3io.dataplane.RaiseForStatus.never)\n", - " response.raise_for_status([409, 204])\n", - " \n", - " \n", - "def push_to_stream(context, stream_path, data):\n", - " records = [{'data': json.dumps(rec)} for rec in data]\n", - " container, stream_path = split_path(stream_path)\n", - " response = context.v3io_client.put_records(container=container,\n", - " path=stream_path, \n", - " records=records)\n", - "\n", - "\n", - 
"def construct_record(record):\n", - " label_col = os.getenv('label_col', 'label')\n", - " prediction_col = os.getenv('prediction_col', 'prediction')\n", - " res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])\n", - " res['feature_vector'] = res.pop('request')['instances'][0]\n", - " res['timestamp'] = res.pop('when')\n", - " res['prediction'] = res['resp'][0]\n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " # create a v3io context object\n", - " v3io_client = v3io.dataplane.Client()\n", - " setattr(context, \"v3io_client\", v3io_client)\n", - " \n", - " # Setup windowing for TSDB writer\n", - " v3f_client = v3f.Client('framesd:8081', container='bigdata')\n", - " setattr(context, \"v3f\", v3f_client)\n", - " window = []\n", - " setattr(context, 'window', window)\n", - " setattr(context, 'window_size', int(os.getenv('window_size', 10)))\n", - " setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))\n", - " try:\n", - " context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)\n", - " except Exception as e:\n", - " context.logger.info(f'Creating context with rate= faile for {e}')\n", - " context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)\n", - " \n", - " # Setup callbacks\n", - " callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]\n", - " setattr(context, 'callbacks', callbacks)\n", - " \n", - " # Setup drift stream\n", - " setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))\n", - " try:\n", - " create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))\n", - " except:\n", - " context.logger.info(f'{context.drift_stream} already exists')\n", - " \n", - " # Load models\n", - " models = {}\n", - " model_types = ['pagehinkely', 'ddm', 'eddm']\n", - " 
path_suffix = '_model_path'\n", - " for model in model_types:\n", - " model_env = f'{model}{path_suffix}'\n", - " if model_env in os.environ:\n", - " with open(os.environ[model_env], 'rb') as f:\n", - " models[model] = load(f)\n", - " setattr(context, 'models', models)\n", - " \n", - " # Columns to check\n", - " setattr(context, 'label_col', os.getenv('label_col', 'label'))\n", - " setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " # Construct event\n", - " context.logger.info(f'event: {event.body}')\n", - " full_event = json.loads(event.body)\n", - " record = construct_record(full_event)\n", - " \n", - " # Is our prediction wrong?\n", - " is_error = record[context.label_col] != record[context.prediction_col]\n", - " context.logger.info(f'Adding {is_error}')\n", - " \n", - " # Process the {is_error} element with our algorithms\n", - " for name, model in context.models.items():\n", - " # Add element\n", - " results = {'timestamp': record['timestamp']}\n", - " results['algorithm'] = name\n", - " model.add_element(is_error)\n", - " \n", - " # Detect warning zone (if applicable to the algorithm)\n", - " if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():\n", - " context.logger.info(f'{name}\\tWarning zone detected')\n", - " results['warning_zone'] = 1\n", - " full_event[f'{name}_warning_zone'] = 1\n", - " else:\n", - " results['warning_zone'] = 0\n", - " full_event[f'{name}_warning_zone'] = 0\n", - " \n", - " # Detect drift\n", - " if model.detected_change():\n", - " context.logger.info('Change Detected')\n", - " results['change_detected'] = 1\n", - " full_event[f'{name}_drift'] = 1\n", - " else:\n", - " results['change_detected'] = 0\n", - " full_event[f'{name}_drift'] = 0\n", - " context.window.append(results)\n", - " \n", - " # Return results\n", - " # Write to 
stream\n", - " push_to_stream(context, context.drift_stream, [full_event])\n", - " \n", - " # Add to callbacks\n", - " if context.callbacks != ['']:\n", - " for callback in context.callbacks:\n", - " requests.post(url=callback,\n", - " json=full_event)\n", - " \n", - " if (len(context.window) / len(context.models)) >= context.window_size:\n", - " df = pd.DataFrame(context.window)\n", - " df['timestamp'] = pd.to_datetime(df['timestamp'])\n", - " df = df.set_index(['timestamp', 'algorithm'])\n", - " context.v3f.write('tsdb', context.tsdb_table, df)\n", - " context.window = []" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "init_context(context)\n", - "event = nuclio.Event(body=json.dumps({'prediction': 0,\n", - " 'when': 'now',\n", - " 'class': 'ClassModel', \n", - " 'model': 'tester_v1', \n", - " 'resp': [0], \n", - " 'request': {'instances': [[1, 1.2, 3]]}}))\n", - "out = handler(context, event)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio deploy -n network-operations-concept-drift -p network-operations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"concept_drift_streaming\", kind='nuclio')\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\", \"framework\": \"sklearn\"}\n", - "fn.export(\"/User/functions/concept_drift_streaming/function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.add_trigger('labeled_stream', stream_trigger)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io()).with_v3io()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stream testing" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [], - "source": [ - "fn = import_function('./function.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy(project='network-operations')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/concept_drift_streaming/latest/src/concept_drift_streaming.py b/functions/development/concept_drift_streaming/latest/src/concept_drift_streaming.py deleted file mode 100644 index ebcbf8a1..00000000 --- a/functions/development/concept_drift_streaming/latest/src/concept_drift_streaming.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import skmultiflow.drift_detection -import numpy as np -import pandas as pd -import os -import json -import v3io.dataplane -import v3io_frames as v3f -import requests -from cloudpickle import load - -import random - - -def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath - - -def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204]) - - -def push_to_stream(context, stream_path, data): - records = [{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - ) - - -def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res - - -def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - 
context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction")) - - -def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 
- full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = [] diff --git a/functions/development/concept_drift_streaming/latest/src/function.yaml b/functions/development/concept_drift_streaming/latest/src/function.yaml deleted file mode 100644 index 3001b1bf..00000000 --- a/functions/development/concept_drift_streaming/latest/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: concept-drift-streaming - tag: '' - hash: dc41ff41149be69f19b91a6d78a06571937063ae - project: '' - labels: - author: orz - framework: sklearn - categories: - - machine-learning - - monitoring -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Deploy a streaming Concept Drift detector on a labeled stream. 
the - nuclio part of the concept_drift function - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: concept-drift-streaming - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - spec: - runtime: python:3.6 - handler: concept_drift_streaming:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9
uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJdID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICA
gICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGluIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmd
lKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQuY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K - source: '' - build: - commands: - - python -m pip install scikit-multiflow==0.4.1 v3io_frames - code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py - origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/concept_drift_streaming/latest/src/item.yaml b/functions/development/concept_drift_streaming/latest/src/item.yaml deleted file mode 100644 index 91dcb9f4..00000000 --- a/functions/development/concept_drift_streaming/latest/src/item.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- monitoring -description: Deploy a streaming Concept Drift detector on a labeled stream. 
the nuclio - part of the concept_drift function -doc: '' -example: concept_drift_streaming.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz - framework: sklearn -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: concept-drift-streaming -platformVersion: 3.5.0 -spec: - filename: concept_drift_streaming.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: - - scikit-multiflow==0.4.1 - - v3io_frames -url: '' -version: 1.1.0 diff --git a/functions/development/concept_drift_streaming/latest/src/requirements.txt b/functions/development/concept_drift_streaming/latest/src/requirements.txt deleted file mode 100644 index fa0fddd8..00000000 --- a/functions/development/concept_drift_streaming/latest/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -skmultiflow \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/static/concept_drift_streaming.html b/functions/development/concept_drift_streaming/latest/static/concept_drift_streaming.html deleted file mode 100644 index 8885d1f6..00000000 --- a/functions/development/concept_drift_streaming/latest/static/concept_drift_streaming.html +++ /dev/null @@ -1,297 +0,0 @@ - - - - - - - -concept_drift_streaming.concept_drift_streaming - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for concept_drift_streaming.concept_drift_streaming

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-
[docs]def split_path(mntpath=""): - if mntpath[0] == "/": - mntpath = mntpath[1:] - paths = mntpath.split("/") - container = paths[0] - subpath = "" - if len(paths) > 1: - subpath = mntpath[len(container) :] - return container, subpath
- - -
[docs]def create_stream(context, path, shards=1): - container, stream_path = split_path(path) - context.logger.info( - f"Creating stream in Container: {container} & Path {stream_path}" - ) - response = context.v3io_client.create_stream( - container=container, - path=stream_path, - shard_count=shards, - raise_for_status=v3io.dataplane.RaiseForStatus.never, - ) - response.raise_for_status([409, 204])
- - -
[docs]def push_to_stream(context, stream_path, data): - records = [{"data": json.dumps(rec)} for rec in data] - container, stream_path = split_path(stream_path) - response = context.v3io_client.put_records( - container=container, path=stream_path, records=records - )
- - -
[docs]def construct_record(record): - label_col = os.getenv("label_col", "label") - prediction_col = os.getenv("prediction_col", "prediction") - res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]]) - res["feature_vector"] = res.pop("request")["instances"][0] - res["timestamp"] = res.pop("when") - res[prediction_col] = res["resp"][0] - return res
- - -
[docs]def init_context(context): - v3io_client = v3io.dataplane.Client() - setattr(context, "v3io_client", v3io_client) - - v3f_client = v3f.Client("framesd:8081", container="bigdata") - setattr(context, "v3f", v3f_client) - window = [] - setattr(context, "window", window) - setattr(context, "window_size", int(os.getenv("window_size", 10))) - setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1")) - try: - context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1) - except Exception as e: - context.logger.info(f"Creating context with rate= faile for {e}") - context.v3f.create( - "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1 - ) - - callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")] - setattr(context, "callbacks", callbacks) - - setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream")) - try: - create_stream( - context, context.drift_stream, int(os.getenv("drift_stream_shards", 1)) - ) - except: - context.logger.info(f"{context.drift_stream} already exists") - - models = {} - model_types = ["pagehinkely", "ddm", "eddm"] - path_suffix = "_model_path" - for model in model_types: - model_env = f"{model}{path_suffix}" - if model_env in os.environ: - with open(os.environ[model_env], "rb") as f: - models[model] = load(f) - setattr(context, "models", models) - - setattr(context, "label_col", os.getenv("label_col", "label")) - setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
- - -
[docs]def handler(context, event): - context.logger.info(f"event: {event.body}") - full_event = json.loads(event.body) - record = construct_record(full_event) - - is_error = record[context.label_col] != record[context.prediction_col] - context.logger.info(f"Adding {is_error}") - - for name, model in context.models.items(): - results = {"timestamp": record["timestamp"]} - results["algorithm"] = name - model.add_element(is_error) - - if hasattr(model, "detected_warning_zone") and model.detected_warning_zone(): - context.logger.info(f"{name}\tWarning zone detected") - results["warning_zone"] = 1 - full_event[f"{name}_warning_zone"] = 1 - else: - results["warning_zone"] = 0 - full_event[f"{name}_warning_zone"] = 0 - - if model.detected_change(): - context.logger.info("Change Detected") - results["change_detected"] = 1 - full_event[f"{name}_drift"] = 1 - else: - results["change_detected"] = 0 - full_event[f"{name}_drift"] = 0 - context.window.append(results) - - push_to_stream(context, context.drift_stream, [full_event]) - - if context.callbacks != [""]: - for callback in context.callbacks: - requests.post(url=callback, json=full_event) - - if (len(context.window) / len(context.models)) >= context.window_size: - df = pd.DataFrame(context.window) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - df = df.set_index(["timestamp", "algorithm"]) - context.v3f.write("tsdb", context.tsdb_table, df) - context.window = []
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/static/documentation.html b/functions/development/concept_drift_streaming/latest/static/documentation.html deleted file mode 100644 index 3698c0c7..00000000 --- a/functions/development/concept_drift_streaming/latest/static/documentation.html +++ /dev/null @@ -1,246 +0,0 @@ - - - - - - - -concept_drift_streaming package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

concept_drift_streaming package

- -
- -
-
-
-
-
-

concept_drift_streaming package#

-
-

Submodules#

-
-
-

concept_drift_streaming.concept_drift_streaming module#

-
-
-concept_drift_streaming.concept_drift_streaming.construct_record(record)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.create_stream(context, path, shards=1)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.handler(context, event)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.init_context(context)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.push_to_stream(context, stream_path, data)[source]#
-
-
-
-concept_drift_streaming.concept_drift_streaming.split_path(mntpath='')[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/static/example.html b/functions/development/concept_drift_streaming/latest/static/example.html deleted file mode 100644 index e3e865b8..00000000 --- a/functions/development/concept_drift_streaming/latest/static/example.html +++ /dev/null @@ -1,598 +0,0 @@ - - - - - - - -Concept Drift Streaming - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- -
-
- Contents -
- -
-
-
-
- -
-

Concept Drift Streaming

- -
-
-
-

Contents

-
- -
-
-
-
-
-
-

Concept Drift Streaming#

-
-
-
import nuclio
-
-
-
-
-
-
-
from pprint import pprint
-
-
-
-
-
-
-
%%nuclio cmd -c
-python -m pip install scikit-multiflow==0.4.1
-python -m pip install v3io_frames
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-# Add V3IO Mount
-# %nuclio env %v3io
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: ignore
-env = {'label_col': 'resp',
-       'prediction_col': 'prediction',
-       'drift_stream': '/bigdata/network-operations/drift_stream',
-       'tsdb_table': 'network-operations/drift_tsdb',
-       'pagehinkley_threshold': 10,
-       'models': ['pagehinkley', 'ddm', 'eddm'],
-       'window_size': 10}
-config = {'kind': 'nuclio',
-          'spec.build.baseImage': 'mlrun/ml-models'}
-cmd = ['python -m pip install scikit-multiflow',
-       'python -m pip install v3io_frames']
-v3io = True
-config = nuclio.ConfigSpec(env=env,
-                           config=config,
-                           cmd=cmd,
-                           v3io=v3io)
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-# For testing
-import random
-
-
-
-
-
-
-
def split_path(mntpath=''):
-    if mntpath[0] == '/':
-        mntpath = mntpath[1:]
-    paths = mntpath.split('/')
-    container = paths[0]
-    subpath = ''
-    if len(paths) > 1:
-        subpath = mntpath[len(container):]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    # create a stream w/8 shards
-    container, stream_path = split_path(path)
-    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')
-    response = context.v3io_client.create_stream(container=container,
-                                        path=stream_path, 
-                                        shard_count=shards,
-                                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
-    response.raise_for_status([409, 204])
-    
-    
-def push_to_stream(context, stream_path, data):
-    records = [{'data': json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(container=container,
-                                               path=stream_path, 
-                                               records=records)
-
-
-def construct_record(record):
-    label_col = os.getenv('label_col', 'label')
-    prediction_col = os.getenv('prediction_col', 'prediction')
-    res = dict([(k, record[k]) for k in ['when', 'class', 'model', 'resp', 'request']])
-    res['feature_vector'] = res.pop('request')['instances'][0]
-    res['timestamp'] = res.pop('when')
-    res['prediction'] = res['resp'][0]
-    return res
-
-
-
-
-
-
-
def init_context(context):
-    # create a v3io context object
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-    
-    # Setup windowing for TSDB writer
-    v3f_client = v3f.Client('framesd:8081', container='bigdata')
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, 'window', window)
-    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
-    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
-    try:
-        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
-    except Exception as e:
-        context.logger.info(f'Creating context with rate= faile for {e}')
-        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
-    
-    # Setup callbacks
-    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
-    setattr(context, 'callbacks', callbacks)
-    
-    # Setup drift stream
-    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
-    try:
-        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
-    except:
-        context.logger.info(f'{context.drift_stream} already exists')
-    
-    # Load models
-    models = {}
-    model_types = ['pagehinkely', 'ddm', 'eddm']
-    path_suffix = '_model_path'
-    for model in model_types:
-        model_env = f'{model}{path_suffix}'
-        if model_env in os.environ:
-            with open(os.environ[model_env], 'rb') as f:
-                models[model] = load(f)
-    setattr(context, 'models', models)
-    
-    # Columns to check
-    setattr(context, 'label_col', os.getenv('label_col', 'label'))
-    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
-
-
-
-
-
-
-
def handler(context, event):
-    # Construct event
-    context.logger.info(f'event: {event.body}')
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-    
-    # Is our prediction wrong?
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f'Adding {is_error}')
-    
-    # Process the {is_error} element with our algorithms
-    for name, model in context.models.items():
-        # Add element
-        results = {'timestamp': record['timestamp']}
-        results['algorithm'] = name
-        model.add_element(is_error)
-        
-        # Detect warning zone (if applicable to the algorithm)
-        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
-            context.logger.info(f'{name}\tWarning zone detected')
-            results['warning_zone'] = 1
-            full_event[f'{name}_warning_zone'] = 1
-        else:
-            results['warning_zone'] = 0
-            full_event[f'{name}_warning_zone'] = 0
-        
-        # Detect drift
-        if model.detected_change():
-            context.logger.info('Change Detected')
-            results['change_detected'] = 1
-            full_event[f'{name}_drift'] = 1
-        else:
-            results['change_detected'] = 0
-            full_event[f'{name}_drift'] = 0
-        context.window.append(results)
-    
-    # Return results
-    # Write to stream
-    push_to_stream(context, context.drift_stream, [full_event])
-    
-    # Add to callbacks
-    if context.callbacks != ['']:
-        for callback in context.callbacks:
-            requests.post(url=callback,
-                          json=full_event)
-    
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df['timestamp'] = pd.to_datetime(df['timestamp'])
-        df = df.set_index(['timestamp', 'algorithm'])
-        context.v3f.write('tsdb', context.tsdb_table, df)
-        context.window = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Test#

-
-
-
init_context(context)
-event = nuclio.Event(body=json.dumps({'prediction': 0,
-                                      'when': 'now',
-                                      'class': 'ClassModel', 
-                                      'model': 'tester_v1', 
-                                      'resp': [0], 
-                                      'request': {'instances': [[1, 1.2, 3]]}}))
-out = handler(context, event)
-out
-
-
-
-
-
-
-

Cluster#

-
-
-
%nuclio deploy -n network-operations-concept-drift -p network-operations
-
-
-
-
-
-
-

Save function yaml#

-
-
-
from os import path
-from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("concept_drift_streaming", kind='nuclio')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
-fn.export("/User/functions/concept_drift_streaming/function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>
-
-
-
-
-
-
-
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')
-
-
-
-
-
-
-
fn.add_trigger('labeled_stream', stream_trigger)
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.apply(mount_v3io()).with_v3io()
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>
-
-
-
-
-
-
-
fn.export("function.yaml")
-
-
-
-
-
-
-

Stream testing#

-
-
-
fn = import_function('./function.yaml')
-
-
-
-
-
-
-
fn.deploy(project='network-operations')
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/static/function.html b/functions/development/concept_drift_streaming/latest/static/function.html deleted file mode 100644 index 32c31c6f..00000000 --- a/functions/development/concept_drift_streaming/latest/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: concept-drift-streaming
-  tag: ''
-  hash: dc41ff41149be69f19b91a6d78a06571937063ae
-  project: ''
-  labels:
-    author: orz
-    framework: sklearn
-  categories:
-  - machine-learning
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Deploy a streaming Concept Drift detector on a labeled stream. the
-    nuclio part of the concept_drift function
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: concept-drift-streaming
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    spec:
-      runtime: python:3.6
-      handler: concept_drift_streaming:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHNrbXVsdGlmbG93LmRyaWZ0X2RldGVjdGlvbgppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IGpzb24KaW1wb3J0IHYzaW8uZGF0YXBsYW5lCmltcG9ydCB2M2lvX2ZyYW1lcyBhcyB2M2YKaW1wb3J0IHJlcXVlc3RzCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmltcG9ydCByYW5kb20KCgpkZWYgc3BsaXRfcGF0aChtbnRwYXRoPSIiKToKICAgIGlmIG1udHBhdGhbMF0gPT0gIi8iOgogICAgICAgIG1udHBhdGggPSBtbnRwYXRoWzE6XQogICAgcGF0aHMgPSBtbnRwYXRoLnNwbGl0KCIvIikKICAgIGNvbnRhaW5lciA9IHBhdGhzWzBdCiAgICBzdWJwYXRoID0gIiIKICAgIGlmIGxlbihwYXRocykgPiAxOgogICAgICAgIHN1YnBhdGggPSBtbnRwYXRoW2xlbihjb250YWluZXIpIDpdCiAgICByZXR1cm4gY29udGFpbmVyLCBzdWJwYXRoCgoKZGVmIGNyZWF0ZV9zdHJlYW0oY29udGV4dCwgcGF0aCwgc2hhcmRzPTEpOgogICAgY29udGFpbmVyLCBzdHJlYW1fcGF0aCA9IHNwbGl0X3BhdGgocGF0aCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJDcmVhdGluZyBzdHJlYW0gaW4gQ29udGFpbmVyOiB7Y29udGFpbmVyfSAmIFBhdGgge3N0cmVhbV9wYXRofSIKICAgICkKICAgIHJlc3BvbnNlID0gY29udGV4dC52M2lvX2NsaWVudC5jcmVhdGVfc3RyZWFtKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsCiAgICAgICAgcGF0aD1zdHJlYW1fcGF0aCwKICAgICAgICBzaGFyZF9jb3VudD1zaGFyZHMsCiAgICAgICAgcmFpc2VfZm9yX3N0YXR1cz12M2lvLmRhdGFwbGFuZS5SYWlzZUZvclN0YXR1cy5uZXZlciwKICAgICkKICAgIHJlc3BvbnNlLnJhaXNlX2Zvcl9zdGF0dXMoWzQwOSwgMjA0XSkKCgpkZWYgcHVzaF90b19zdHJlYW0oY29udGV4dCwgc3RyZWFtX3BhdGgsIGRhdGEpOgogICAgcmVjb3JkcyA9IFt7ImRhdGEiOiBqc29uLmR1bXBzKHJlYyl9IGZvciByZWMgaW4gZGF0YV0KICAgIGNvbnRhaW5lciwgc3RyZWFtX3BhdGggPSBzcGxpdF9wYXRoKHN0cmVhbV9wYXRoKQogICAgcmVzcG9uc2UgPSBjb250ZXh0LnYzaW9fY2xpZW50LnB1dF9yZWNvcmRzKAogICAgICAgIGNvbnRhaW5lcj1jb250YWluZXIsIHBhdGg9c3RyZWFtX3BhdGgsIHJlY29yZHM9cmVjb3JkcwogICAgKQoKCmRlZiBjb25zdHJ1Y3RfcmVjb3JkKHJlY29yZCk6CiAgICBsYWJlbF9jb2wgPSBvcy5nZXRlbnYoImxhYmVsX2NvbCIsICJsYWJlbCIpCiAgICBwcmVkaWN0aW9uX2NvbCA9IG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpCiAgICByZXMgPSBkaWN0KFsoaywgcmVjb3JkW2tdKSBmb3IgayBpbiBbIndoZW4iLCAiY2xhc3MiLCAibW9kZWwiLCAicmVzcCIsICJyZXF1ZXN0Il1dKQogICAgcmVzWyJmZWF0dXJlX3ZlY3RvciJ
dID0gcmVzLnBvcCgicmVxdWVzdCIpWyJpbnN0YW5jZXMiXVswXQogICAgcmVzWyJ0aW1lc3RhbXAiXSA9IHJlcy5wb3AoIndoZW4iKQogICAgcmVzW3ByZWRpY3Rpb25fY29sXSA9IHJlc1sicmVzcCJdWzBdCiAgICByZXR1cm4gcmVzCgoKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIHYzaW9fY2xpZW50ID0gdjNpby5kYXRhcGxhbmUuQ2xpZW50KCkKICAgIHNldGF0dHIoY29udGV4dCwgInYzaW9fY2xpZW50IiwgdjNpb19jbGllbnQpCgogICAgdjNmX2NsaWVudCA9IHYzZi5DbGllbnQoImZyYW1lc2Q6ODA4MSIsIGNvbnRhaW5lcj0iYmlnZGF0YSIpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2M2YiLCB2M2ZfY2xpZW50KQogICAgd2luZG93ID0gW10KICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIHdpbmRvdykKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvd19zaXplIiwgaW50KG9zLmdldGVudigid2luZG93X3NpemUiLCAxMCkpKQogICAgc2V0YXR0cihjb250ZXh0LCAidHNkYl90YWJsZSIsIG9zLmdldGVudigidHNkYl90YWJsZSIsICJjb25jZXB0X2RyaWZ0X3RzZGJfMSIpKQogICAgdHJ5OgogICAgICAgIGNvbnRleHQudjNmLmNyZWF0ZSgidHNkYiIsIGNvbnRleHQudHNkYl90YWJsZSwgcmF0ZT0iMS9zIiwgaWZfZXhpc3RzPTEpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkNyZWF0aW5nIGNvbnRleHQgd2l0aCByYXRlPSBmYWlsZSBmb3Ige2V9IikKICAgICAgICBjb250ZXh0LnYzZi5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgY29udGV4dC50c2RiX3RhYmxlLCBhdHRycz17InJhdGUiOiAiMS9zIn0sIGlmX2V4aXN0cz0xCiAgICAgICAgKQoKICAgIGNhbGxiYWNrcyA9IFtjYWxsYmFjay5zdHJpcCgpIGZvciBjYWxsYmFjayBpbiBvcy5nZXRlbnYoImNhbGxiYWNrcyIsICIiKS5zcGxpdCgiLCIpXQogICAgc2V0YXR0cihjb250ZXh0LCAiY2FsbGJhY2tzIiwgY2FsbGJhY2tzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImRyaWZ0X3N0cmVhbSIsIG9zLmdldGVudigiZHJpZnRfc3RyZWFtIiwgIi9iaWdkYXRhL2RyaWZ0X3N0cmVhbSIpKQogICAgdHJ5OgogICAgICAgIGNyZWF0ZV9zdHJlYW0oCiAgICAgICAgICAgIGNvbnRleHQsIGNvbnRleHQuZHJpZnRfc3RyZWFtLCBpbnQob3MuZ2V0ZW52KCJkcmlmdF9zdHJlYW1fc2hhcmRzIiwgMSkpCiAgICAgICAgKQogICAgZXhjZXB0OgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJ7Y29udGV4dC5kcmlmdF9zdHJlYW19IGFscmVhZHkgZXhpc3RzIikKCiAgICBtb2RlbHMgPSB7fQogICAgbW9kZWxfdHlwZXMgPSBbInBhZ2VoaW5rZWx5IiwgImRkbSIsICJlZGRtIl0KICAgIHBhdGhfc3VmZml4ID0gIl9tb2RlbF9wYXRoIgogICAgZm9yIG1vZGVsIGluIG1vZGVsX3R5cGVzOgogICAgICAgIG1vZGVsX2VudiA9IGYie21vZGVsfXtwYXRoX3N1ZmZpeH0iCiAgICAgICAgaWYgbW9kZWxfZW52IGl
uIG9zLmVudmlyb246CiAgICAgICAgICAgIHdpdGggb3Blbihvcy5lbnZpcm9uW21vZGVsX2Vudl0sICJyYiIpIGFzIGY6CiAgICAgICAgICAgICAgICBtb2RlbHNbbW9kZWxdID0gbG9hZChmKQogICAgc2V0YXR0cihjb250ZXh0LCAibW9kZWxzIiwgbW9kZWxzKQoKICAgIHNldGF0dHIoY29udGV4dCwgImxhYmVsX2NvbCIsIG9zLmdldGVudigibGFiZWxfY29sIiwgImxhYmVsIikpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uX2NvbCIsIG9zLmdldGVudigicHJlZGljdGlvbl9jb2wiLCAicHJlZGljdGlvbiIpKQoKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJldmVudDoge2V2ZW50LmJvZHl9IikKICAgIGZ1bGxfZXZlbnQgPSBqc29uLmxvYWRzKGV2ZW50LmJvZHkpCiAgICByZWNvcmQgPSBjb25zdHJ1Y3RfcmVjb3JkKGZ1bGxfZXZlbnQpCgogICAgaXNfZXJyb3IgPSByZWNvcmRbY29udGV4dC5sYWJlbF9jb2xdICE9IHJlY29yZFtjb250ZXh0LnByZWRpY3Rpb25fY29sXQogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIkFkZGluZyB7aXNfZXJyb3J9IikKCiAgICBmb3IgbmFtZSwgbW9kZWwgaW4gY29udGV4dC5tb2RlbHMuaXRlbXMoKToKICAgICAgICByZXN1bHRzID0geyJ0aW1lc3RhbXAiOiByZWNvcmRbInRpbWVzdGFtcCJdfQogICAgICAgIHJlc3VsdHNbImFsZ29yaXRobSJdID0gbmFtZQogICAgICAgIG1vZGVsLmFkZF9lbGVtZW50KGlzX2Vycm9yKQoKICAgICAgICBpZiBoYXNhdHRyKG1vZGVsLCAiZGV0ZWN0ZWRfd2FybmluZ196b25lIikgYW5kIG1vZGVsLmRldGVjdGVkX3dhcm5pbmdfem9uZSgpOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYie25hbWV9XHRXYXJuaW5nIHpvbmUgZGV0ZWN0ZWQiKQogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDEKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXN1bHRzWyJ3YXJuaW5nX3pvbmUiXSA9IDAKICAgICAgICAgICAgZnVsbF9ldmVudFtmIntuYW1lfV93YXJuaW5nX3pvbmUiXSA9IDAKCiAgICAgICAgaWYgbW9kZWwuZGV0ZWN0ZWRfY2hhbmdlKCk6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNoYW5nZSBEZXRlY3RlZCIpCiAgICAgICAgICAgIHJlc3VsdHNbImNoYW5nZV9kZXRlY3RlZCJdID0gMQogICAgICAgICAgICBmdWxsX2V2ZW50W2Yie25hbWV9X2RyaWZ0Il0gPSAxCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcmVzdWx0c1siY2hhbmdlX2RldGVjdGVkIl0gPSAwCiAgICAgICAgICAgIGZ1bGxfZXZlbnRbZiJ7bmFtZX1fZHJpZnQiXSA9IDAKICAgICAgICBjb250ZXh0LndpbmRvdy5hcHBlbmQocmVzdWx0cykKCiAgICBwdXNoX3RvX3N0cmVhbShjb250ZXh0LCBjb250ZXh0LmRyaWZ0X3N0cmVhbSwgW2Z1bGxfZXZlbnRdKQoKICAgIGlmIGNvbnRleHQ
uY2FsbGJhY2tzICE9IFsiIl06CiAgICAgICAgZm9yIGNhbGxiYWNrIGluIGNvbnRleHQuY2FsbGJhY2tzOgogICAgICAgICAgICByZXF1ZXN0cy5wb3N0KHVybD1jYWxsYmFjaywganNvbj1mdWxsX2V2ZW50KQoKICAgIGlmIChsZW4oY29udGV4dC53aW5kb3cpIC8gbGVuKGNvbnRleHQubW9kZWxzKSkgPj0gY29udGV4dC53aW5kb3dfc2l6ZToKICAgICAgICBkZiA9IHBkLkRhdGFGcmFtZShjb250ZXh0LndpbmRvdykKICAgICAgICBkZlsidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShkZlsidGltZXN0YW1wIl0pCiAgICAgICAgZGYgPSBkZi5zZXRfaW5kZXgoWyJ0aW1lc3RhbXAiLCAiYWxnb3JpdGhtIl0pCiAgICAgICAgY29udGV4dC52M2Yud3JpdGUoInRzZGIiLCBjb250ZXh0LnRzZGJfdGFibGUsIGRmKQogICAgICAgIGNvbnRleHQud2luZG93ID0gW10K
-  source: ''
-  build:
-    commands:
-    - python -m pip install scikit-multiflow==0.4.1 v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#d96059851b5d51fd4583e982483eb973fccc47d2:/User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-    origin_filename: /User/test/functions/concept_drift_streaming/concept_drift_streaming.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/static/item.html b/functions/development/concept_drift_streaming/latest/static/item.html deleted file mode 100644 index ad0dc6ee..00000000 --- a/functions/development/concept_drift_streaming/latest/static/item.html +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- monitoring
-description: Deploy a streaming Concept Drift detector on a labeled stream. the nuclio
-  part of the concept_drift function
-doc: ''
-example: concept_drift_streaming.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-  framework: sklearn
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: concept-drift-streaming
-platformVersion: 3.5.0
-spec:
-  filename: concept_drift_streaming.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements:
-  - scikit-multiflow==0.4.1
-  - v3io_frames
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/concept_drift_streaming/latest/static/source.html b/functions/development/concept_drift_streaming/latest/static/source.html deleted file mode 100644 index 2524ecf3..00000000 --- a/functions/development/concept_drift_streaming/latest/static/source.html +++ /dev/null @@ -1,179 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import skmultiflow.drift_detection
-import numpy as np
-import pandas as pd
-import os
-import json
-import v3io.dataplane
-import v3io_frames as v3f
-import requests
-from cloudpickle import load
-
-import random
-
-
-def split_path(mntpath=""):
-    if mntpath[0] == "/":
-        mntpath = mntpath[1:]
-    paths = mntpath.split("/")
-    container = paths[0]
-    subpath = ""
-    if len(paths) > 1:
-        subpath = mntpath[len(container) :]
-    return container, subpath
-
-
-def create_stream(context, path, shards=1):
-    container, stream_path = split_path(path)
-    context.logger.info(
-        f"Creating stream in Container: {container} & Path {stream_path}"
-    )
-    response = context.v3io_client.create_stream(
-        container=container,
-        path=stream_path,
-        shard_count=shards,
-        raise_for_status=v3io.dataplane.RaiseForStatus.never,
-    )
-    response.raise_for_status([409, 204])
-
-
-def push_to_stream(context, stream_path, data):
-    records = [{"data": json.dumps(rec)} for rec in data]
-    container, stream_path = split_path(stream_path)
-    response = context.v3io_client.put_records(
-        container=container, path=stream_path, records=records
-    )
-
-
-def construct_record(record):
-    label_col = os.getenv("label_col", "label")
-    prediction_col = os.getenv("prediction_col", "prediction")
-    res = dict([(k, record[k]) for k in ["when", "class", "model", "resp", "request"]])
-    res["feature_vector"] = res.pop("request")["instances"][0]
-    res["timestamp"] = res.pop("when")
-    res[prediction_col] = res["resp"][0]
-    return res
-
-
-def init_context(context):
-    v3io_client = v3io.dataplane.Client()
-    setattr(context, "v3io_client", v3io_client)
-
-    v3f_client = v3f.Client("framesd:8081", container="bigdata")
-    setattr(context, "v3f", v3f_client)
-    window = []
-    setattr(context, "window", window)
-    setattr(context, "window_size", int(os.getenv("window_size", 10)))
-    setattr(context, "tsdb_table", os.getenv("tsdb_table", "concept_drift_tsdb_1"))
-    try:
-        context.v3f.create("tsdb", context.tsdb_table, rate="1/s", if_exists=1)
-    except Exception as e:
-        context.logger.info(f"Creating context with rate= faile for {e}")
-        context.v3f.create(
-            "tsdb", context.tsdb_table, attrs={"rate": "1/s"}, if_exists=1
-        )
-
-    callbacks = [callback.strip() for callback in os.getenv("callbacks", "").split(",")]
-    setattr(context, "callbacks", callbacks)
-
-    setattr(context, "drift_stream", os.getenv("drift_stream", "/bigdata/drift_stream"))
-    try:
-        create_stream(
-            context, context.drift_stream, int(os.getenv("drift_stream_shards", 1))
-        )
-    except:
-        context.logger.info(f"{context.drift_stream} already exists")
-
-    models = {}
-    model_types = ["pagehinkely", "ddm", "eddm"]
-    path_suffix = "_model_path"
-    for model in model_types:
-        model_env = f"{model}{path_suffix}"
-        if model_env in os.environ:
-            with open(os.environ[model_env], "rb") as f:
-                models[model] = load(f)
-    setattr(context, "models", models)
-
-    setattr(context, "label_col", os.getenv("label_col", "label"))
-    setattr(context, "prediction_col", os.getenv("prediction_col", "prediction"))
-
-
-def handler(context, event):
-    context.logger.info(f"event: {event.body}")
-    full_event = json.loads(event.body)
-    record = construct_record(full_event)
-
-    is_error = record[context.label_col] != record[context.prediction_col]
-    context.logger.info(f"Adding {is_error}")
-
-    for name, model in context.models.items():
-        results = {"timestamp": record["timestamp"]}
-        results["algorithm"] = name
-        model.add_element(is_error)
-
-        if hasattr(model, "detected_warning_zone") and model.detected_warning_zone():
-            context.logger.info(f"{name}\tWarning zone detected")
-            results["warning_zone"] = 1
-            full_event[f"{name}_warning_zone"] = 1
-        else:
-            results["warning_zone"] = 0
-            full_event[f"{name}_warning_zone"] = 0
-
-        if model.detected_change():
-            context.logger.info("Change Detected")
-            results["change_detected"] = 1
-            full_event[f"{name}_drift"] = 1
-        else:
-            results["change_detected"] = 0
-            full_event[f"{name}_drift"] = 0
-        context.window.append(results)
-
-    push_to_stream(context, context.drift_stream, [full_event])
-
-    if context.callbacks != [""]:
-        for callback in context.callbacks:
-            requests.post(url=callback, json=full_event)
-
-    if (len(context.window) / len(context.models)) >= context.window_size:
-        df = pd.DataFrame(context.window)
-        df["timestamp"] = pd.to_datetime(df["timestamp"])
-        df = df.set_index(["timestamp", "algorithm"])
-        context.v3f.write("tsdb", context.tsdb_table, df)
-        context.window = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.0.1/src/README.ipynb b/functions/development/feature_perms/0.0.1/src/README.ipynb deleted file mode 100644 index 0929a6f6..00000000 --- a/functions/development/feature_perms/0.0.1/src/README.ipynb +++ /dev/null @@ -1,788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# feature importances" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are a number of ways to compute feature importances and **the default estimates reported by scikit learn can be shown to be biased** under certain circumstances. In addition, many non-tree algorithms do not provide conveniently calculated feature importance estimates. The following demonstration is based on material that draws heavily from the following sources:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## references\n", - "\n", - "\n", - "### repos\n", - "\n", - "* **[Feature importances for scikit-learn machine learning models](https://github.com/parrt/random-forest-importances)**, [MIT License](https://github.com/parrt/random-forest-importances/blob/master/LICENSE)\n", - "* **[Scikit-Learn ensemble module - forests](https://github.com/scikit-learn/scikit-learn/blob/0.23.1/sklearn/ensemble/_forest.py)**, [BSD License](https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/ensemble/_forest.py#L40)\n", - "* **[ELI5 - Permutation Importance](https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html)** \n", - "\n", - "### articles\n", - "\n", - "Strobl, C., Boulesteix, A., Zeileis, A. et al. **[Bias in random forest variable importance measures: Illustrations, sources and a solution](https://link.springer.com/article/10.1186/1471-2105-8-25#citeas)**. BMC Bioinformatics 8, 25 (2007). https://doi.org/10.1186/1471-2105-8-25 \n", - "\n", - "Strobl, C., Boulesteix, A., Kneib, T. et al. 
**[Conditional variable importance for random forests](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307#citeas)**. BMC Bioinformatics 9, 307 (2008). https://doi.org/10.1186/1471-2105-9-307 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## what we'll do\n", - "\n", - "* demonstrate an issue with default feature importance estimates \n", - "* provide alternatives and compare to the default \n", - "* create a new function `feature_perms` that implements a computationally simple algorithm \n", - "* create a new function `dropcol_importances` that implements a computationally intensive algorithm that is more accurate\n", - "* test our new functions\n", - "\n", - "It should be noted that although we are developing this notebook using a classification example, an almost identical presentation can be done for regression." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier as SomeModel\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from typing import Union, Callable, List" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## default feature importances\n", - "\n", - "This is a function that plots default feature importances from an estimated model object when available. 
It is taken from mlrun's current source-code implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def feature_importances(\n", - " model: SomeModel,\n", - " header: List[str], \n", - " figsz=(10, 5)\n", - ") -> None:\n", - " \"\"\"Display default model feature importances\n", - "\n", - " Only works for models with attribute 'feature_importances_`\n", - "\n", - " :param model: fitted model with a feature_importances_ attribute\n", - " :param header: feature labels\n", - " :param figsz: matplotlib figure size\n", - " \"\"\"\n", - " if not hasattr(model, \"feature_importances_\"):\n", - " raise Exception(\n", - " \"feature importances are only available for some models\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(model.feature_importances_, header)\n", - " feature_imp = pd.DataFrame(\n", - " sorted(zipped), columns=[\"freq\", \"feature\"]).sort_values(\n", - " by=\"freq\", ascending=False)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"freq\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"features\")\n", - " plt.tight_layout();\n", - " \n", - " return feature_imp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permuted features\n", - "\n", - "A proposed solution that has general applicability is randomly permuted features**[refs](#references)**: \n", - "* loop through the feature set \n", - "* shuffle one feature \n", - "* run predict\n", - "* compare the (marginal) change in accuracy (or other metric of interest) \n", - "\n", - "This approach is computationally more demanding than relying on the default values, however it can be easily parallelized. To perform the estimation we only need an estimated model and a held-out test set. 
The following was proposed in **[Beware Default Random Forest Importances](https://explained.ai/rf-importance/index.html)**:\n", - "\n", - "( the following 3 glue functions will no longer be publicly visible in the sklearn package from 0.24 onwards, consider this a temporary hack while we refactor these away)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**the following has been refactored in final version of function:**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from distutils.version import LooseVersion\n", - "import numpy as np\n", - "from sklearn.utils import check_random_state\n", - "\n", - "def _generate_sample_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to _parallel_build_trees function.\n", - " taken from:\n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L116\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n", - "\n", - " return sample_indices\n", - "\n", - "def _generate_unsampled_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to forest._set_oob_score function.\n", - " taken from: \n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L126\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " sample_indices = _generate_sample_indices(random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - " unsampled_mask = sample_counts == 0\n", - " indices_range = np.arange(n_samples)\n", - " unsampled_indices 
= indices_range[unsampled_mask]\n", - "\n", - " return unsampled_indices\n", - "\n", - "def _get_unsampled_indices(tree, n_samples: int):\n", - " \"\"\"\n", - " An interface to get unsampled indices regardless of sklearn version.\n", - " \"\"\"\n", - " import warnings\n", - " warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", - " if LooseVersion(sklearn.__version__) >= LooseVersion(\"0.22\"):\n", - " # Version 0.22 or newer uses 3 arguments.\n", - " from sklearn.ensemble.forest import _get_n_samples_bootstrap\n", - " n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " return _generate_unsampled_indices(tree.random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " else:\n", - " # Version 0.21 or older uses only two arguments.\n", - " return _generate_unsampled_indices(tree.random_state, n_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following function estimates classifier accuracy and has been borrowed from **[references](#references)**. 
See **[breitman on oob](https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf)** for details on out-of-bag estimation:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def oob_classifier_accuracy(rf, X_train: np.array, y_train: np.array) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values\n", - " y = y_train.values\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_indices(tree, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Putting it all together:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def permutation_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array, \n", - " header: List[str],\n", - " metric: Callable = oob_classifier_accuracy,\n", - " figsz=(10, 5)\n", - ") -> np.array:\n", - " \"\"\"calculate change in metric from permuting feature columns\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " uses a pre-estimated model\n", - "\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truths, regression targets\n", - " :param header: column labels for X_train\n", - " 
:param figsz: matplotlib figure size\n", - " \n", - " \"\"\"\n", - " baseline = metric(model, X_train, y_train)\n", - " imp = []\n", - " for col in X_train.columns:\n", - " save = X_train[col].copy()\n", - " X_train[col] = np.random.permutation(X_train[col])\n", - " m = metric(model, X_train, y_train)\n", - " X_train[col] = save\n", - " imp.append(baseline - m)\n", - " \n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"feature permutation importances\")\n", - " plt.tight_layout()\n", - "\n", - " return np.array(feature_imp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances\n", - "\n", - "According to our **[references](#references)** a more accurate measure of feature importance would have us re-estimate the model after dropping a column. This is considered as being close to \"ideal\". 
Unfortunately, the entire model needs to be re-estimated for each column and without some approximating shortcut this is likely to be infeasible for large datasets.\n", - "\n", - "Here is the suggested implementation and **don't run this on big models!**:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def dropcol_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array,\n", - " header: List[str] = [],\n", - " random_state: int = 1994,\n", - " figsz=(10, 5)\n", - ") -> pd.DataFrame:\n", - " \"\"\"drop columns and re-estimate model\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " :param rf: model to fit\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truth labels\n", - "\n", - " Returns:\n", - " pd.DataFrame: table of diffs vs baseline metric\n", - " \"\"\"\n", - " # cloning makes copy of model pre-fit\n", - " # calculate a baseline with all features\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_train, y_train)\n", - " baseline = model_.oob_score_\n", - " \n", - " # now drop each colum, refit model and calc metric\n", - " imp = []\n", - " for col in X_train.columns:\n", - " X = X_train.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X, y_train)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " \n", - " # put it all in a table\n", - " imp = np.array(imp)\n", - " feature_imps = pd.DataFrame(\n", - " data={'feature': X_train.columns,\n", - " 'importance': imp})\n", - " #feature_imps.set_index('feature', inplace=True)\n", - " feature_imps.sort_values('importance', ascending=True, inplace=True)\n", - " \n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imps)\n", - " plt.title(\"drop column feature 
importances\")\n", - " plt.tight_layout()\n", - " \n", - " return feature_imps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## demonstration\n", - "\n", - "In this demonstratuon we are going to take a fraction of a fraction of **[Kaggle's RentHop rental listing interest competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)**--the complete dataset is presently >80GB, we'll be looking at 5K rows. \n", - "\n", - "The competition's **[goal](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)** was\n", - "> to predict the number of inquiries a new listing receives based on the listing’s creation date and other features. \n", - "\n", - "Doing so would help **[RentHop](https://www.renthop.com/)**\n", - "> better handle fraud control, identify potential listing quality issues, and allow owners and agents to better understand renters’ needs and preferences." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = \"/User/artifacts/two-sigma-connect-rental-listing-inquiries/\"\n", - "NFRAC = 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample dimensions (4935, 6)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bathroomsbedroomspricelongitudelatitudeinterest_level
182351.001800-73.996040.71972
421401.023000-73.987940.76533
46771.011350-73.899640.85492
\n", - "
" - ], - "text/plain": [ - " bathrooms bedrooms price longitude latitude interest_level\n", - "18235 1.0 0 1800 -73.9960 40.7197 2\n", - "42140 1.0 2 3000 -73.9879 40.7653 3\n", - "4677 1.0 1 1350 -73.8996 40.8549 2" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(data + 'rent.csv').sample(frac=NFRAC)\n", - "print(\"sample dimensions\", df.shape)\n", - "df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']\n", - "dfr = df[features]\n", - "\n", - "# drop price column\n", - "X_train, y_train = dfr.drop('price', axis=1), dfr['price']\n", - "\n", - "# insert column with random values\n", - "X_train['random'] = np.random.random(size=len(X_train))\n", - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'random']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_jobs=-1, oob_score=True)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# define model\n", - "model_params = {\n", - " \"n_estimators\" : 100, \n", - " \"min_samples_leaf\" : 1,\n", - " \"n_jobs\" : -1,\n", - " \"oob_score\" : True\n", - "}\n", - "\n", - "model = SomeModel(**model_params)\n", - "\n", - "# estimate\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### to run this the model needs a default attribute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "default feature_importances [0.01683784 0.03215169 0.29983429 0.30418813 0.34698806]\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "if hasattr(model, \"feature_importances_\"):\n", - " print(\"default feature_importances\", model.feature_importances_)\n", - " feature_importances(model, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permutation importances\n", - "\n", - "No need to check for default attributes or functions, this can be run on any kind of model:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.06545086119554205, 'longitude'],\n", - " [0.06160081053698076, 'latitude'],\n", - " [0.053495440729483285, 'bedrooms'],\n", - " [0.021681864235055734, 'bathrooms'],\n", - " [0.0004052684903748799, 'random']], dtype=object)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pi = permutation_importances(model, X_train, y_train, features)\n", - "pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
4random-0.042756
0bathrooms0.001824
1bedrooms0.023100
2longitude0.049240
3latitude0.051874
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "4 random -0.042756\n", - "0 bathrooms 0.001824\n", - "1 bedrooms 0.023100\n", - "2 longitude 0.049240\n", - "3 latitude 0.051874" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dc = dropcol_importances(model, X_train, y_train)\n", - "dc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## conclusions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So I would say location is a prime factor, then the number of bedrooms. Bathrooms often is gte bedrooms, and is likely correlated so one of them should likely be dropped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "toc-autonumbering": false, - "toc-showcode": false, - "toc-showmarkdowntxt": false - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/feature_perms/0.0.1/src/feature_perms.ipynb b/functions/development/feature_perms/0.0.1/src/feature_perms.ipynb deleted file mode 100644 index 77da7b55..00000000 --- a/functions/development/feature_perms/0.0.1/src/feature_perms.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# permutation_importances as reusable function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## function code" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import 
numbers\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "from sklearn.utils import check_random_state\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from cloudpickle import load\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem\n", - "from mlrun.artifacts import get_model, PlotArtifact\n", - "from typing import Union, Callable, List\n", - "\n", - "def _get_n_samples_bootstrap(n_samples, max_samples) -> int:\n", - " \"\"\"get the number of samples in a bootstrap sample\n", - " \n", - " returns the total number of samples to draw for the bootstrap sample\n", - " \n", - " private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py\n", - "\n", - " :param n_samples: Number of samples in the dataset.\n", - " :param max_samples: \n", - " The maximum number of samples to draw from the total available:\n", - " - if float, this indicates a fraction of the total and should be\n", - " the interval `(0, 1)`;\n", - " - if int, this indicates the exact number of samples;\n", - " - if None, this indicates the total number of samples.\n", - " \"\"\"\n", - " if max_samples is None:\n", - " return n_samples\n", - "\n", - " if isinstance(max_samples, numbers.Integral):\n", - " if not (1 <= max_samples <= n_samples):\n", - " msg = \"`max_samples` must be in range 1 to {} but got value {}\"\n", - " raise ValueError(msg.format(n_samples, max_samples))\n", - " return max_samples\n", - "\n", - " if isinstance(max_samples, numbers.Real):\n", - " if not (0 < max_samples < 1):\n", - " msg = \"`max_samples` must be in range (0, 1) but got value {}\"\n", - " raise ValueError(msg.format(max_samples))\n", - " return int(round(n_samples * max_samples))\n", - "\n", - " msg = \"`max_samples` should be int or float, but got type '{}'\"\n", - " raise TypeError(msg.format(type(max_samples)))\n", - "\n", - "def _get_unsampled_ix(random_state, n_samples: int) -> np.array:\n", 
- " \"\"\"\n", - " future-proof get unsampled indices\n", - " \"\"\"\n", - " n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - "\n", - " return np.arange(n_samples)[sample_counts==0]\n", - "\n", - "def _oob_classifier_accuracy(rf, X_train, y_train) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train\n", - " y = y_train.values if isinstance(y_train, pd.Series) else y_train\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score\n", - "\n", - "def permutation_importances(\n", - " context: MLClientCtx,\n", - " model: DataItem,\n", - " dataset: DataItem,\n", - " labels: str,\n", - " figsz=(10, 5),\n", - " plots_dest: str = \"plots\",\n", - " fitype: str = \"permute\"\n", - ") -> pd.DataFrame:\n", - " \"\"\"calculate change in metric\n", - " \n", - " type 'permute' uses a pre-estimated model\n", - " type 'dropcol' uses a re-estimates model\n", - " \n", - " :param context: the function's execution context\n", - " :param model: a trained model\n", - " :param dataset: 
features and ground truths, regression targets\n", - " :param labels name of the ground truths column\n", - " :param figsz: matplotlib figure size\n", - " :param plots_dest: path within artifact store\n", - " :\n", - " \"\"\"\n", - " model_file, model_data, _ = get_model(model.url, suffix='.pkl')\n", - " model = load(open(str(model_file), \"rb\"))\n", - " \n", - " X = dataset.as_df()\n", - " y = X.pop(labels)\n", - " header = X.columns\n", - " \n", - " # this will be paramettrized next version, and include regression\n", - " metric = _oob_classifier_accuracy\n", - " \n", - " baseline = metric(model, X, y)\n", - " \n", - " imp = []\n", - " for col in X.columns:\n", - " if fitype is \"permute\":\n", - " save = X[col].copy()\n", - " X[col] = np.random.permutation(X[col])\n", - " m = metric(model, X, y)\n", - " X[col] = save\n", - " imp.append(baseline - m)\n", - " elif fitype is \"dropcol\":\n", - " X_ = X.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_, y)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " else:\n", - " raise ValueError(\"unknown fitype, only 'permute' or 'dropcol' permitted\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(f\"feature importances-{fitype}\")\n", - " plt.tight_layout()\n", - "\n", - " context.log_artifact(PlotArtifact(f\"feature importances-{fitype}\", body=plt.gcf()),\n", - " local_path=f\"{plots_dest}/feature-permutations.html\")\n", - " context.log_dataset(f\"feature-importances-{fitype}-tbl\", df=feature_imp, index=False)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## save function" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.platforms.other import auto_mount\n", - "\n", - "gpus = False\n", - "\n", - "# create job function object from notebook code\n", - "fn_params = {\n", - " \"name\" : \"feature-perms\",\n", - " \"handler\" : \"permutation_importances\",\n", - " \"kind\" : \"job\",\n", - " \"image\" : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n", - " \"description\" : \"estimate feature importances using permutations\",\n", - " \"categories\" : [\"analysis\"],\n", - " \"labels\" : {\"author\": \"yjb\"}\n", - "}\n", - "\n", - "perms_fn = code_to_function(**fn_params)\n", - "perms_fn.apply(auto_mount())\n", - "perms_fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "from mlrun import NewTask, mlconf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### get some data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:25,486 Job is running in 
the background, pod: tasks-arc-to-parq-xqkr7\n", - "[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet\n", - "[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp\n", - "[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading\n", - "[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default\n", - "[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed\n" - ] - } - ], - "source": [ - "data_url = \"https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv\"\n", - "\n", - "fn = import_function(\"hub://arc_to_parquet\", \"a2p\")\n", - "fn.apply(auto_mount())\n", - "\n", - "params = {\n", - " \"name\" : \"tasks arc-to-parq\",\n", - " \"params\" : {\"key\":\"rent\", \"stats\": True, \"file_ext\":\"csv\"}\n", - "}\n", - "acquire_run = fn.run(NewTask(**params),inputs={\"archive_url\" : data_url},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### train a model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5\n", - "[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model\n", - "[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y\n", - "[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N\n", - "[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N\n", - "[mlrun] 2020-06-07 19:58:37,806 log artifact 
precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N\n", - "[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N\n", - "[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default\n", - "[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed\n" - ] - } - ], - "source": [ - "fn = import_function(\"hub://sklearn_classifier\", \"skrf\")\n", - "fn.apply(auto_mount())\n", - "\n", - "# define model\n", - "params = {\n", - " \"name\" : \"tasks random forest\",\n", - " \"params\" : {\n", - " \"sample\" : -5_000, # 5k random rows,\n", - " \"model_pkg_class\" : \"sklearn.ensemble.RandomForestClassifier\",\n", - " \"label_column\" : \"interest_level\",\n", - " \"CLASS_n_estimators\" : 100,\n", - " \"CLASS_min_samples_leaf\" : 1,\n", - " \"CLASS_n_jobs\" : -1,\n", - " \"CLASS_oob_score\" : True}\n", - "}\n", - "\n", - "train_run = fn.run(NewTask(**params), inputs={\"dataset\" : acquire_run.outputs[\"rent\"]},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Importances

\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=train_run.outputs['feature-importances'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "data = acquire_run.outputs[\"rent\"]\n", - "labels = \"interest_level\"\n", - "model = train_run.outputs[\"model\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt\n", - "[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances\n", - "[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y\n", - "[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default\n", - "[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed\n" - ] - } - ], - "source": [ - "fi_perms = perms_fn.run(\n", - " NewTask(params={\"labels\": labels, \n", - " \"plots_dest\": \"plots\"}),\n", - " inputs={\"model\": model, \"dataset\": data},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=fi_perms.outputs['feature importances-permute'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/feature_perms/0.0.1/src/feature_perms.py b/functions/development/feature_perms/0.0.1/src/feature_perms.py deleted file mode 100644 index 3a6c9948..00000000 --- a/functions/development/feature_perms/0.0.1/src/feature_perms.py +++ /dev/null @@ -1,160 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import numpy as np -import pandas as pd -import numbers - 
-import sklearn -from sklearn.base import clone -from sklearn.utils import check_random_state - -import matplotlib.pyplot as plt -import seaborn as sns - -from cloudpickle import load - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem -from mlrun.artifacts import get_model, PlotArtifact -from typing import Union, Callable, List - - -def _get_n_samples_bootstrap(n_samples, max_samples) -> int: - """get the number of samples in a bootstrap sample - - returns the total number of samples to draw for the bootstrap sample - - private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py - - :param n_samples: Number of samples in the dataset. - :param max_samples: - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. - """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _get_unsampled_ix(random_state, n_samples: int) -> np.array: - """ - future-proof get unsampled indices - """ - n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples) - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_bootstrap) - sample_counts = np.bincount(sample_indices, minlength=n_samples) - - return 
np.arange(n_samples)[sample_counts == 0] - - -def _oob_classifier_accuracy(rf, X_train, y_train) -> float: - """ - Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier. - - https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425 - """ - X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train - y = y_train.values if isinstance(y_train, pd.Series) else y_train - - n_samples = len(X) - n_classes = len(np.unique(y)) - predictions = np.zeros((n_samples, n_classes)) - for tree in rf.estimators_: - unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples) - tree_preds = tree.predict_proba(X[unsampled_indices, :]) - predictions[unsampled_indices] += tree_preds - - predicted_class_indexes = np.argmax(predictions, axis=1) - predicted_classes = [rf.classes_[i] for i in predicted_class_indexes] - - oob_score = np.mean(y == predicted_classes) - - return oob_score - - -def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline 
- m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - ) diff --git a/functions/development/feature_perms/0.0.1/src/function.yaml b/functions/development/feature_perms/0.0.1/src/function.yaml deleted file mode 100644 index e9d3913a..00000000 --- a/functions/development/feature_perms/0.0.1/src/function.yaml +++ /dev/null @@ -1,63 +0,0 @@ -kind: job -metadata: - name: feature-perms - tag: '' - hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7 - project: default - labels: - author: yjb - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: permutation_importance - entry_points: - permutation_importance: - name: permutation_importance - doc: 'calculate change in metric - - - type ''permute'' uses a pre-estimated model - - type ''dropcol'' uses a re-estimates model' - parameters: - - name: context - type: MLClientCtx - doc: the function's execution context - default: '' - - name: model - type: DataItem - doc: a trained model - default: '' - - name: dataset - type: DataItem - doc: features and ground truths, regression targets - default: '' - - name: labels - type: str - default: '' - - name: figsz - doc: 
matplotlib figure size - default: - - 10 - - 5 - - name: plots_dest - type: str - doc: path within artifact store - default: plots - - name: fitype - type: str - default: permute - outputs: - - default: '' - lineno: 93 - description: estimate feature importances using permutations - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0
gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3N
lcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA
9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py - affinity: null -verbose: false diff --git a/functions/development/feature_perms/0.0.1/src/item.yaml b/functions/development/feature_perms/0.0.1/src/item.yaml deleted file mode 100644 index 1fd8d6da..00000000 --- a/functions/development/feature_perms/0.0.1/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: 
-- data-analysis -description: estimate feature importances using permutations -doc: '' -example: feature_perms.ipynb -generationDate: 2021-05-19:22-41 -icon: '' -labels: - author: yjb -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: feature-perms -platformVersion: '' -spec: - filename: feature_perms.py - handler: permutation_importance - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 0.0.1 diff --git a/functions/development/feature_perms/0.0.1/src/requirements.txt b/functions/development/feature_perms/0.0.1/src/requirements.txt deleted file mode 100644 index f53a3289..00000000 --- a/functions/development/feature_perms/0.0.1/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -mlrun -sklearn -matplotlib -seaborn -scikit-plot - diff --git a/functions/development/feature_perms/0.0.1/src/test_feature_perms.py b/functions/development/feature_perms/0.0.1/src/test_feature_perms.py deleted file mode 100644 index b270292d..00000000 --- a/functions/development/feature_perms/0.0.1/src/test_feature_perms.py +++ /dev/null @@ -1,98 +0,0 @@ -from mlrun import code_to_function, import_function -from pathlib import Path -import os - -ARTIFACTS_PATH = 'artifacts' -DATA_URL = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv" -FEATURE_OUTPUT = ARTIFACTS_PATH + "/feature-importances-permute-tbl.parquet" - - -def arc_to_parquet(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - archive_func = import_function('hub://arc_to_parquet') - archive_run = archive_func.run(handler="arc_to_parquet", - params={"key": "rent", "stats": True, "file_ext": "csv"}, - inputs={"archive_url": DATA_URL}, - artifact_path=os.getcwd() + '/artifacts' - , local=True - ) - - -def sklearn_classifier(run): - cwd = os.getcwd() - file_path = str(Path(cwd).parent.absolute()) + "/sklearn_classifier/sklearn_classifier.py" - fn = code_to_function(name='test_sklearn_classifier', - filename=file_path, - 
handler="train_model", - kind="local", - ) - fn.spec.command = file_path - fn.run(params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - handler="train_model", - inputs={"dataset": run.outputs["rent"]}, - artifact_path='artifacts' - # , local=True - ) - - -def train_model(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - train = import_function('hub://sklearn_classifier') - # .apply(auto_mount()) - - train_run = train.run( - inputs={"dataset": "artifacts/rent.csv"}, - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - local=True) - - -def test_feature_selection_run_local(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fn = code_to_function(name='test_run_local_feature_perms', - filename="feature_perms.py", - handler="permutation_importance", - kind="local", - ) - fn.spec.command = "feature_perms.py" - fi_perms = fn.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path='artifacts') - assert Path(FEATURE_OUTPUT).is_file() - - -def test_feature_perms_import_function(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fi_perms = import_function("function.yaml") - fi_perms.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path=os.getcwd() + '/artifacts' - , local=True) - assert Path(FEATURE_OUTPUT).is_file() diff --git 
a/functions/development/feature_perms/0.0.1/static/documentation.html b/functions/development/feature_perms/0.0.1/static/documentation.html deleted file mode 100644 index d485113f..00000000 --- a/functions/development/feature_perms/0.0.1/static/documentation.html +++ /dev/null @@ -1,145 +0,0 @@ - - - - - - - -feature_perms package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

feature_perms package

-
-

Submodules

-
-
-

feature_perms.feature_perms module

-
-
-feature_perms.feature_perms.permutation_importance(context: mlrun.execution.MLClientCtx, model: mlrun.datastore.base.DataItem, dataset: mlrun.datastore.base.DataItem, labels: str, figsz=(10, 5), plots_dest: str = 'plots', fitype: str = 'permute')pandas.core.frame.DataFrame[source]
-

calculate change in metric

-

type ‘permute’ uses a pre-estimated model -type ‘dropcol’ uses a re-estimates model

-
-
Parameters
-
    -
  • context – the function’s execution context

  • -
  • model – a trained model

  • -
  • dataset – features and ground truths, regression targets

  • -
-
-
-

:param labels name of the ground truths column -:param figsz: matplotlib figure size -:param plots_dest: path within artifact store -:

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/0.0.1/static/example.html b/functions/development/feature_perms/0.0.1/static/example.html deleted file mode 100644 index 3eecd61f..00000000 --- a/functions/development/feature_perms/0.0.1/static/example.html +++ /dev/null @@ -1,1065 +0,0 @@ - - - - - - - -permutation_importances as reusable function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

permutation_importances as reusable function

-
-

function code

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-    
-    returns the total number of samples to draw for the bootstrap sample
-    
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples: 
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts==0]
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-    
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-    
-    return oob_score
-
-def permutation_importances(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute"
-) -> pd.DataFrame:
-    """calculate change in metric
-    
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-    
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix='.pkl')
-    model = load(open(str(model_file), "rb"))
-    
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-    
-    # this will be paramettrized next version, and include regression
-    metric = _oob_classifier_accuracy
-    
-    baseline = metric(model, X, y)
-    
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    # create a feature importance table with desired labels
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-                         local_path=f"{plots_dest}/feature-permutations.html")
-    context.log_dataset(f"feature-importances-{fitype}-tbl", df=feature_imp, index=False)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

save function

-
-
-
from mlrun import code_to_function
-from mlrun.platforms.other import auto_mount
-
-gpus = False
-
-# create job function object from notebook code
-fn_params = {
-    "name"        : "feature-perms",
-    "handler"     : "permutation_importances",
-    "kind"        : "job",
-    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
-    "description" : "estimate feature importances using permutations",
-    "categories"  : ["analysis"],
-    "labels"      : {"author": "yjb"}
-}
-
-perms_fn = code_to_function(**fn_params)
-perms_fn.apply(auto_mount())
-perms_fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7feb0104efd0>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-from mlrun import NewTask, mlconf
-
-
-
-
-
-

get some data

-
-
-
data_url = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv"
-
-fn = import_function("hub://arc_to_parquet", "a2p")
-fn.apply(auto_mount())
-
-params = {
-    "name" : "tasks arc-to-parq",
-    "params" : {"key":"rent", "stats": True, "file_ext":"csv"}
-}
-acquire_run = fn.run(NewTask(**params),inputs={"archive_url" : data_url},
-                     artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:25,486 Job is running in the background, pod: tasks-arc-to-parq-xqkr7
-[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet
-[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp
-[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading
-[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y
-
-[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default
-[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed
-
-
-
-
-
-
-

train a model

-
-
-
fn = import_function("hub://sklearn_classifier", "skrf")
-fn.apply(auto_mount())
-
-# define model
-params = {
-    "name" : "tasks random forest",
-    "params" : {
-        "sample"                 : -5_000, # 5k random rows,
-        "model_pkg_class"        : "sklearn.ensemble.RandomForestClassifier",
-        "label_column"           : "interest_level",
-        "CLASS_n_estimators"     : 100,
-        "CLASS_min_samples_leaf" : 1,
-        "CLASS_n_jobs"           : -1,
-        "CLASS_oob_score"        : True}
-}
-
-train_run = fn.run(NewTask(**params), inputs={"dataset" : acquire_run.outputs["rent"]},
-                   artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5
-[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model
-[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y
-[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N
-[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N
-[mlrun] 2020-06-07 19:58:37,806 log artifact precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N
-[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N
-[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y
-
-[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default
-[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=train_run.outputs['feature-importances'])
-
-
-
-
-

Feature Importances

-
-
-
-
-
data   = acquire_run.outputs["rent"]
-labels = "interest_level"
-model  = train_run.outputs["model"]
-
-
-
-
-
-
-
fi_perms = perms_fn.run(
-    NewTask(params={"labels": labels, 
-                    "plots_dest": "plots"}),
-    inputs={"model": model, "dataset": data},
-    artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt
-[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances
-[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y
-[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y
-
-[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default
-[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=fi_perms.outputs['feature importances-permute'])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/0.0.1/static/function.html b/functions/development/feature_perms/0.0.1/static/function.html deleted file mode 100644 index 662b2eb8..00000000 --- a/functions/development/feature_perms/0.0.1/static/function.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: feature-perms
-  tag: ''
-  hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7
-  project: default
-  labels:
-    author: yjb
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: permutation_importance
-  entry_points:
-    permutation_importance:
-      name: permutation_importance
-      doc: 'calculate change in metric
-
-
-        type ''permute'' uses a pre-estimated model
-
-        type ''dropcol'' uses a re-estimates model'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function's execution context
-        default: ''
-      - name: model
-        type: DataItem
-        doc: a trained model
-        default: ''
-      - name: dataset
-        type: DataItem
-        doc: features and ground truths, regression targets
-        default: ''
-      - name: labels
-        type: str
-        default: ''
-      - name: figsz
-        doc: matplotlib figure size
-        default:
-        - 10
-        - 5
-      - name: plots_dest
-        type: str
-        doc: path within artifact store
-        default: plots
-      - name: fitype
-        type: str
-        default: permute
-      outputs:
-      - default: ''
-      lineno: 93
-  description: estimate feature importances using permutations
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1
iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10
gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9
kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.0.1/static/item.html b/functions/development/feature_perms/0.0.1/static/item.html deleted file mode 100644 index d51f70c1..00000000 --- a/functions/development/feature_perms/0.0.1/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: estimate feature importances using permutations
-doc: ''
-example: feature_perms.ipynb
-generationDate: 2021-05-19:22-41
-icon: ''
-labels:
-  author: yjb
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: feature-perms
-platformVersion: ''
-spec:
-  filename: feature_perms.py
-  handler: permutation_importance
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.0.1/static/source.html b/functions/development/feature_perms/0.0.1/static/source.html deleted file mode 100644 index 3013427a..00000000 --- a/functions/development/feature_perms/0.0.1/static/source.html +++ /dev/null @@ -1,182 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-def permutation_importance(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute",
-) -> pd.DataFrame:
-    """calculate change in metric
-
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
-    model = load(open(str(model_file), "rb"))
-
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-
-    metric = _oob_classifier_accuracy
-
-    baseline = metric(model, X, y)
-
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            #model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(
-        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-        local_path=f"{plots_dest}/feature-permutations.html",
-    )
-    context.log_dataset(
-        f"feature-importances-{fitype}-tbl", df=feature_imp, index=False
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.8.0/src/README.ipynb b/functions/development/feature_perms/0.8.0/src/README.ipynb deleted file mode 100644 index 0929a6f6..00000000 --- a/functions/development/feature_perms/0.8.0/src/README.ipynb +++ /dev/null @@ -1,788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# feature importances" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are a number of ways to compute feature importances and **the default estimates reported by scikit learn can be shown to be biased** under certain circumstances. In addition, many non-tree algorithms do not provide conveniently calculated feature importance estimates. The following demonstration is based on material that draws heavily from the following sources:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## references\n", - "\n", - "\n", - "### repos\n", - "\n", - "* **[Feature importances for scikit-learn machine learning models](https://github.com/parrt/random-forest-importances)**, [MIT License](https://github.com/parrt/random-forest-importances/blob/master/LICENSE)\n", - "* **[Scikit-Learn ensemble module - forests](https://github.com/scikit-learn/scikit-learn/blob/0.23.1/sklearn/ensemble/_forest.py)**, [BSD License](https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/ensemble/_forest.py#L40)\n", - "* **[ELI5 - Permutation Importance](https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html)** \n", - "\n", - "### articles\n", - "\n", - "Strobl, C., Boulesteix, A., Zeileis, A. et al. **[Bias in random forest variable importance measures: Illustrations, sources and a solution](https://link.springer.com/article/10.1186/1471-2105-8-25#citeas)**. BMC Bioinformatics 8, 25 (2007). https://doi.org/10.1186/1471-2105-8-25 \n", - "\n", - "Strobl, C., Boulesteix, A., Kneib, T. et al. 
**[Conditional variable importance for random forests](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307#citeas)**. BMC Bioinformatics 9, 307 (2008). https://doi.org/10.1186/1471-2105-9-307 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## what we'll do\n", - "\n", - "* demonstrate an issue with default feature importance estimates \n", - "* provide alternatives and compare to the default \n", - "* create a new function `feature_perms` that implements a computationally simple algorithm \n", - "* create a new function `dropcol_importances` that implements a computationally intensive algorithm that is more accurate\n", - "* test our new functions\n", - "\n", - "It should be noted that although we are developing this notebook using a classification example, an almost identical presentation can be done for regression." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier as SomeModel\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from typing import Union, Callable, List" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## default feature importances\n", - "\n", - "This is a function that plots default feature importances from an estimated model object when available. 
It is taken from mlrun's current source-code implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def feature_importances(\n", - " model: SomeModel,\n", - " header: List[str], \n", - " figsz=(10, 5)\n", - ") -> None:\n", - " \"\"\"Display default model feature importances\n", - "\n", - " Only works for models with attribute 'feature_importances_`\n", - "\n", - " :param model: fitted model with a feature_importances_ attribute\n", - " :param header: feature labels\n", - " :param figsz: matplotlib figure size\n", - " \"\"\"\n", - " if not hasattr(model, \"feature_importances_\"):\n", - " raise Exception(\n", - " \"feature importances are only available for some models\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(model.feature_importances_, header)\n", - " feature_imp = pd.DataFrame(\n", - " sorted(zipped), columns=[\"freq\", \"feature\"]).sort_values(\n", - " by=\"freq\", ascending=False)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"freq\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"features\")\n", - " plt.tight_layout();\n", - " \n", - " return feature_imp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permuted features\n", - "\n", - "A proposed solution that has general applicability is randomly permuted features**[refs](#references)**: \n", - "* loop through the feature set \n", - "* shuffle one feature \n", - "* run predict\n", - "* compare the (marginal) change in accuracy (or other metric of interest) \n", - "\n", - "This approach is computationally more demanding than relying on the default values, however it can be easily parallelized. To perform the estimation we only need an estimated model and a held-out test set. 
The following was proposed in **[Beware Default Random Forest Importances](https://explained.ai/rf-importance/index.html)**:\n", - "\n", - "( the following 3 glue functions will no longer be publicly visible in the sklearn package from 0.24 onwards, consider this a temporary hack while we refactor these away)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**the following has been refactored in final version of function:**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from distutils.version import LooseVersion\n", - "import numpy as np\n", - "from sklearn.utils import check_random_state\n", - "\n", - "def _generate_sample_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to _parallel_build_trees function.\n", - " taken from:\n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L116\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n", - "\n", - " return sample_indices\n", - "\n", - "def _generate_unsampled_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to forest._set_oob_score function.\n", - " taken from: \n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L126\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " sample_indices = _generate_sample_indices(random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - " unsampled_mask = sample_counts == 0\n", - " indices_range = np.arange(n_samples)\n", - " unsampled_indices 
= indices_range[unsampled_mask]\n", - "\n", - " return unsampled_indices\n", - "\n", - "def _get_unsampled_indices(tree, n_samples: int):\n", - " \"\"\"\n", - " An interface to get unsampled indices regardless of sklearn version.\n", - " \"\"\"\n", - " import warnings\n", - " warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", - " if LooseVersion(sklearn.__version__) >= LooseVersion(\"0.22\"):\n", - " # Version 0.22 or newer uses 3 arguments.\n", - " from sklearn.ensemble.forest import _get_n_samples_bootstrap\n", - " n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " return _generate_unsampled_indices(tree.random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " else:\n", - " # Version 0.21 or older uses only two arguments.\n", - " return _generate_unsampled_indices(tree.random_state, n_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following function estimates classifier accuracy and has been borrowed from **[references](#references)**. 
See **[breitman on oob](https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf)** for details on out-of-bag estimation:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def oob_classifier_accuracy(rf, X_train: np.array, y_train: np.array) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values\n", - " y = y_train.values\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_indices(tree, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Putting it all together:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def permutation_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array, \n", - " header: List[str],\n", - " metric: Callable = oob_classifier_accuracy,\n", - " figsz=(10, 5)\n", - ") -> np.array:\n", - " \"\"\"calculate change in metric from permuting feature columns\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " uses a pre-estimated model\n", - "\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truths, regression targets\n", - " :param header: column labels for X_train\n", - " 
:param figsz: matplotlib figure size\n", - " \n", - " \"\"\"\n", - " baseline = metric(model, X_train, y_train)\n", - " imp = []\n", - " for col in X_train.columns:\n", - " save = X_train[col].copy()\n", - " X_train[col] = np.random.permutation(X_train[col])\n", - " m = metric(model, X_train, y_train)\n", - " X_train[col] = save\n", - " imp.append(baseline - m)\n", - " \n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"feature permutation importances\")\n", - " plt.tight_layout()\n", - "\n", - " return np.array(feature_imp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances\n", - "\n", - "According to our **[references](#references)** a more accurate measure of feature importance would have us re-estimate the model after dropping a column. This is considered as being close to \"ideal\". 
Unfortunately, the entire model needs to be re-estimated for each column and without some approximating shortcut this is likely to be infeasible for large datasets.\n", - "\n", - "Here is the suggested implementation and **don't run this on big models!**:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def dropcol_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array,\n", - " header: List[str] = [],\n", - " random_state: int = 1994,\n", - " figsz=(10, 5)\n", - ") -> pd.DataFrame:\n", - " \"\"\"drop columns and re-estimate model\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " :param rf: model to fit\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truth labels\n", - "\n", - " Returns:\n", - " pd.DataFrame: table of diffs vs baseline metric\n", - " \"\"\"\n", - " # cloning makes copy of model pre-fit\n", - " # calculate a baseline with all features\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_train, y_train)\n", - " baseline = model_.oob_score_\n", - " \n", - " # now drop each colum, refit model and calc metric\n", - " imp = []\n", - " for col in X_train.columns:\n", - " X = X_train.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X, y_train)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " \n", - " # put it all in a table\n", - " imp = np.array(imp)\n", - " feature_imps = pd.DataFrame(\n", - " data={'feature': X_train.columns,\n", - " 'importance': imp})\n", - " #feature_imps.set_index('feature', inplace=True)\n", - " feature_imps.sort_values('importance', ascending=True, inplace=True)\n", - " \n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imps)\n", - " plt.title(\"drop column feature 
importances\")\n", - " plt.tight_layout()\n", - " \n", - " return feature_imps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## demonstration\n", - "\n", - "In this demonstratuon we are going to take a fraction of a fraction of **[Kaggle's RentHop rental listing interest competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)**--the complete dataset is presently >80GB, we'll be looking at 5K rows. \n", - "\n", - "The competition's **[goal](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)** was\n", - "> to predict the number of inquiries a new listing receives based on the listing’s creation date and other features. \n", - "\n", - "Doing so would help **[RentHop](https://www.renthop.com/)**\n", - "> better handle fraud control, identify potential listing quality issues, and allow owners and agents to better understand renters’ needs and preferences." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = \"/User/artifacts/two-sigma-connect-rental-listing-inquiries/\"\n", - "NFRAC = 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample dimensions (4935, 6)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bathroomsbedroomspricelongitudelatitudeinterest_level
182351.001800-73.996040.71972
421401.023000-73.987940.76533
46771.011350-73.899640.85492
\n", - "
" - ], - "text/plain": [ - " bathrooms bedrooms price longitude latitude interest_level\n", - "18235 1.0 0 1800 -73.9960 40.7197 2\n", - "42140 1.0 2 3000 -73.9879 40.7653 3\n", - "4677 1.0 1 1350 -73.8996 40.8549 2" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(data + 'rent.csv').sample(frac=NFRAC)\n", - "print(\"sample dimensions\", df.shape)\n", - "df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']\n", - "dfr = df[features]\n", - "\n", - "# drop price column\n", - "X_train, y_train = dfr.drop('price', axis=1), dfr['price']\n", - "\n", - "# insert column with random values\n", - "X_train['random'] = np.random.random(size=len(X_train))\n", - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'random']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_jobs=-1, oob_score=True)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# define model\n", - "model_params = {\n", - " \"n_estimators\" : 100, \n", - " \"min_samples_leaf\" : 1,\n", - " \"n_jobs\" : -1,\n", - " \"oob_score\" : True\n", - "}\n", - "\n", - "model = SomeModel(**model_params)\n", - "\n", - "# estimate\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### to run this the model needs a default attribute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "default feature_importances [0.01683784 0.03215169 0.29983429 0.30418813 0.34698806]\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "if hasattr(model, \"feature_importances_\"):\n", - " print(\"default feature_importances\", model.feature_importances_)\n", - " feature_importances(model, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permutation importances\n", - "\n", - "No need to check for default attributes or functions, this can be run on any kind of model:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.06545086119554205, 'longitude'],\n", - " [0.06160081053698076, 'latitude'],\n", - " [0.053495440729483285, 'bedrooms'],\n", - " [0.021681864235055734, 'bathrooms'],\n", - " [0.0004052684903748799, 'random']], dtype=object)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pi = permutation_importances(model, X_train, y_train, features)\n", - "pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
4random-0.042756
0bathrooms0.001824
1bedrooms0.023100
2longitude0.049240
3latitude0.051874
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "4 random -0.042756\n", - "0 bathrooms 0.001824\n", - "1 bedrooms 0.023100\n", - "2 longitude 0.049240\n", - "3 latitude 0.051874" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsgAAAFgCAYAAACmDI9oAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de5geZX3/8fdHonIUFFIrSoiCioAYZEERBDxh8YiKRaUqVqVa8VAVq9WfiqfaUooHRKUewBOgVlvUqlBF0HiABCIBATkLijSInAWFfH9/zERv1t3Nk012n83yfl3XXvvMzD0z35l7k3z2zj3Pk6pCkiRJUuduwy5AkiRJmkkMyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALGnGS3JMkvcMu46JJPlekpfNgDqS5NNJfpvk9GHXs6qSfDPJi4ddh6S7NgOyJM0uuwNPAh5QVbuszoGSHJjkB2umrMFU1T5Vdex0nnM8M+WXHknTz4Asaa2WZM6wa5hhtgQuq6qbh13I2to3/Si8/z5Kd2H+BSBpxkmyY5Izk9yY5ARg3WbbXkmuTPKPSX4NfLpf//IkFyW5NsmJSTZv9qkkr0lySZJrkhw2XgBKsk6Sf0pycX/+xUm26Lc9JskZSa7vvz9mnGO8M8nnmuX5fQ1z+uXvJXlPkh8muSnJ15JsmuTzSW7ojz1/VP2vSHJhP3XiI0kyxnlfCnwC2LU/7qH9+qclWZLkuv6cOzT7vLm51p8leVa//mHAx5pjXdfU/rJm/zuNMve1virJhcCF/bptkpzc980FSf56rPs2+vj9sRcmOaKv/ZK+Dw5MckWS/2unY/RTcT7Wn+vGJKcm2bLZPm7/9ed9b5KFwC3AZ4HHAkf2139k3+6D/blv6H82Hjuq37+Y5DP9+c9NMtJs3yLJV5IsS/KbFcfst/1tkvP6/v32irrTOaK/1uuTnJ1k+/Hun6Q1w4AsaUZJcg/gv+gCyn2ALwHPGdXsL/ttWwIHJXk88M/AXwP3Ay4Hjh+1z7OAEeCRwDOBvx2nhNcDzweeAtyrb3dLkvsA3wA+BGwK/DvwjSSbTvJSnwe8ELg/sBXwI7qwfx/gPOAdo9o/DdgZeER/nU8efcCq+iTwCuBHVbVhVb0jySOBTwF/19f9ceDEJPfsd7uYLghuDBwKfC7J/arqvFHH2mQVrm1f4FHAtkk2AE4GvgD8Bd29PSrJdgMe61HA2X3tX6Dr152BrYG/oQuwGzbtDwDeDWwGLAE+DzBg/70QOAjYCDgQ+D5wcH/9B/dtzgAW0PXTF4AvJVm3OcYz+ho3AU4EVgTrdYCv0/1szqfr9+P7bfsC/wQ8G5jbn/e4/nh7A3sAD+mPuT/wmwHvnaRJMiBLmmkeDdwd+EBV/aGqvkwXSlrLgXdU1W1V9Tu6UPSpqjqzqm4D3kI38jm/2edfquraqvoF8AG6oDaWlwFvq6oLqvPTqvoN8FTgwqr6bFXdXlXHAecDT5/kdX66qi6uquuBbwIXV9X/VtXtdL8U7Diq/fur6rq+/lPoQtogXg58vKp+UlV39PN7b6O7z1TVl6rqV1W1vKpOoBv1Xa25y8A/9/f6d3TB/rKq+nR/384E/hPYb8BjXdrvewdwArAF8K6+708Cfk8Xllf4RlWd1v8cvJXu52ALBuu/Y6rq3H77H8Yqpqo+V1W/6dscDtwTeGjT5AdV9T99vZ+l+4UGunu6OXBIVd1cVbdW1YqR97/r79l5ff+/D1jQjyL/gS6wbwOkb3PVgPdO0iQZkCXNNJsDv6yqatZdPqrNsqq6ddQ+f2xTVTfRjbLdv2lzxajjbc7YtqAbVR2rrtF1XD7qHKvi6ub178ZY3vDOzfl18/qWMbaPZ0
vgDf0Uhev6qRJb0F9/khc10y+uA7anG31dHe293hJ41KjzH0D3vwCDGH1fqKqJ7tUfz93/HFxLd62D9N8VrESSN/RTIa7vr2Vj7ny/RvfTuumm1mwBXN4H4NG2BD7Y3J9rgQD3r6rv0o1CfwS4OsnRSe61sjolrR4DsqSZ5irg/qPm2M4b1aZGLf+KLmQA0P+3/qbAL5s2W4w63q/GOf8VdFMeRrvTOZrj/HKMtjcD6zfLg4bBqXAF8N6q2qT5Wr+qjutHKP8DOBjYtJ9GcQ5dOIM/v88w2LW1+10BnDrq/BtW1StX+8rG9sd+7qde3Ieu7wbpv9HXe6flfr7xP9JNcbl3f7+u50/3ayJXAPMy9oOLVwB/N+oerVdVPwSoqg9V1U7AdnRTLQ4Z4HySVoMBWdJM8yPgduA1SeYkeTYr/y//LwAvSbKgn1v7PuAnVXVZ0+aQJPfu/7v9tXT/XT+WTwDvTvLg/gGpHfp5qv8DPCTJC/q69ge2pZtXOtoSYI8k85JsTDflY1j+A3hFkkf117NBkqcm2QjYgC4ELgNI8hK6EeQVrgYe0M8LX2EJ8Owk6yfZGnjpSs7/dbr79sIkd++/dk73EOBUeEqS3fua3033c3AFq9Z/K1wNPKhZ3ojuZ3MZMCfJ2+nmqQ/idLpf/t7f98G6SXbrt30MeMuKedlJNk7y3P71zn3f3Z3ul5NbgTsGPKekSTIgS5pRqur3dA8rHQj8lu6hpK+sZJ/vAP+Pbm7rVXQjwM8b1ey/gcV0Ae8bwCfHOdy/A18ETgJu6Nut189DfhrwBrrpG28CnlZV14xRz8l0Afzs/pwThbApVVWL6OYhH0l3Py+iu7dU1c+Aw+l+KbkaeDiwsNn9u8C5wK+TrLjOI+jm/V4NHEv/ENwE57+R7kGz59GN4v4a+Be6ubtT4Qt0DzheC+xEN52DVem/xgeB/fp3lvgQ8G26+eI/p5uecSsDTMvoz38H3XznrYFfAFfS/WxTVV+luyfHJ7mBbhR/n37Xe9H9kvPb/py/Af5tkHNKmrzceZqfJM0+SQp4cFVdNOxaNHWSHANcWVVvG3YtktZujiBLkiRJDQOyJEmS1HCKhSRJktRwBFmSJElqjPV+jJqEzTbbrObPnz/sMiRJkjSgxYsXX1NVc0evNyCvIfPnz2fRokXDLkOSJEkDSjL6EzYBp1hIkiRJd2JAliRJkhoGZEmSJKlhQJYkSZIaPqS3ltrpkM8MuwRJQ7L4sBcNuwRJmtUcQZYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIad4mAnOSyJJsNuw5JkiTNfDM+IKcz4+uUJEnS7DAjg2eS+UnOS3IUcCbwySSLkpyb5NCm3WVJDk1yZpKlSbbp12+a5KQkZyX5OJBmn9cnOaf/el1zvvOTfKJf//kkT0yyMMmFSXaZ5lsgSZKkIZmRAbn3UOAzVbUj8IaqGgF2APZMskPT7pqqeiTwUeCN/bp3AD/o9z0RmAeQZCfgJcCjgEcDL0+yY7/P1sAH+3NsA7wA2L0/5j9N2VVKkiRpRpnJAfnyqvpx//qvk5wJnAVsB2zbtPtK/30xML9/vQfwOYCq+gbw23797sBXq+rmqrqp3/ex/bZLq2ppVS0HzgW+U1UFLG2OeydJDupHthctW7ZstS5WkiRJM8NMDsg3AyR5IN0o7hOqagfgG8C6Tbvb+u93AHOa9TXGMTPGutHHAVjeLC8fddw/naDq6KoaqaqRuXPnTnBoSZIkrS1mckBe4V50Yfn6JPcF9hlgn9OAAwCS7APcu1m/b5L1k2wAPAv4/povWZIkSWurMUdGZ5Kq+mmSs+imPVwCLBxgt0OB4/ppGacCv+iPdWaSY4DT+3afqK
qzksxf03VLkiRp7ZRumq1W18jISC1atGjazrfTIZ+ZtnNJmlkWH/aiYZcgSbNCksX9G0HcydowxUKSJEmaNgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpMWfYBWhyFh/2omGXIEmSNCs5gixJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALEmSJDXmDLsASdKq+cW7Hj5w23lvXzqFlUjS7OQIsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktSYsoCcZH6Sc1ah/YFJNm+WL0uy2dRUJ0mSJI1tJo0gHwhsvrJGrSRzpqYUSZIk3VVNdUCek+TYJGcn+XKS9ZO8PckZSc5JcnQ6+wEjwOeTLEmyXr//q5OcmWRpkm0Akryz3+8k4DNJ1k3y6b7NWUke17cbb/2BSf4rydeSXJrk4CSv79v8OMl9+navSfKzvvbjp/g+SZIkaYaY6oD8UODoqtoBuAH4e+DIqtq5qrYH1gOeVlVfBhYBB1TVgqr6Xb//NVX1SOCjwBub4+4EPLOqXgC8CqCqHg48Hzg2yboTrAfYHngBsAvwXuCWqtoR+BHwor7Nm4Ed+9pfMdbFJTkoyaIki5YtW7Yat0mSJEkzxVQH5CuqamH/+nPA7sDjkvwkyVLg8cB2E+z/lf77YmB+s/7EJkTvDnwWoKrOBy4HHjLBeoBTqurGqloGXA98rV+/tDnP2XQj2n8D3D5WcVV1dFWNVNXI3LlzJ7gMSZIkrS2mOiDXGMtHAfv1I7v/Aaz7Z3v9yW399zuAdr7xzc3rjLPveOvb4wIsb5aXN+d5KvARutHqxc53liRJumuY6oA8L8mu/evnAz/oX1+TZENgv6btjcBGkzjHacABAEkeAswDLphg/UoluRuwRVWdArwJ2ATYcBK1SZIkaS0z1aOi5wEvTvJx4EK6ucT3ppvKcBlwRtP2GOBjSX4H7Mrgjur3W0o3FeLAqrotyXjrBznmOsDnkmxMNxJ9RFVdtwo1SZIkaS2VqtGzIDQZIyMjtWjRomGXIeku4BfvevjAbee9fekUViJJa7cki6tqZPT6mfQ+yJIkSdLQGZAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWrMGXYBkqRVM+/tS4ddgiTNao4gS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJjTnDLkCStGp2+/Buwy5Ba4mFr1447BKktZIjyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVJjpQE5yX2TfDLJN/vlbZO8dOpLkyRJkqbfICPIxwDfBjbvl38OvG6qCpIkSZKGaZCAvFlVfRFYDlBVtwN3TGlVkiRJ0pAMEpBvTrIpUABJHg1cP6VVSZIkSUMyZ4A2rwdOBLZKshCYC+w3pVVJkiRJQzJhQE5yN2BdYE/goUCAC6rqD9NQmyRJkjTtJgzIVbU8yeFVtStw7jTVJEmSJA3NIHOQT0rynCSZ8mokSZKkIRt0DvIGwO1JbqWbZlFVda8prUySJEkagpUG5KraaDoKkSRJkm
aClQbkJHuMtb6qTlvz5UiSJEnDNcgUi0Oa1+sCuwCLgcdPSUWSJEnSEK30Ib2qenrz9SRge+DqVT1RkvlJzplMkauzryRJkrQqBnkXi9GupAvJQ5dkkBFwSZIkaWCDzEH+MP3HTNMF6gXATyd7viTHAjsCPwdeBDwM+HdgQ+Aa4MCquirJTsCngFuAHzT1HAg8lW66xwZJngD8K7BPX+d7quqE/m3pxlq/F3Ao3Sj4AuArwFLgtcB6wL5VdXGS5wLvAO4Arq+qMediS5IkaXYZZAR2UfP6duC4qlo4yfM9FHhpVS1M8ingVcCzgGdW1bIk+wPvBf4W+DTw6qo6Nclho46zK7BDVV2b5Dl0QfcRwGbAGUlOAx4zznr6dQ8DrgUuAT5RVbskeS3wauB1wNuBJ1fVL5NsMtbFJDkIOAhg3rx5k7wlkiRJmkkGmWKxSVUd2399vg+3r53k+a5owvXngCfTTdc4OckS4G3AA5Js3J/31L7tZ0cd5+SqurZ/vTtdaL+jqq4GTgV2nmA9wBlVdVVV3QZcDJzUr18KzO9fLwSOSfJyYJ2xLqaqjq6qkaoamTt37qrfDUmSJM04gwTkF4+x7sBJnq9GLd8InFtVC/qvh1fV3vQfRjLBcW5uXo/3CX8TffLfbc3r5c3ycvpR9ap6BV1g3wJYkmTTCY4nSZKkWWLcgJzk+Um+BjwwyYnN1ynAbyZ5vnlJdu1fPx/4MTB3xbokd0+yXVVdB1yfZPe+7QETHPM0YP8k6ySZC+wBnD7B+oEk2aqqflJVb6ebG73FKlynJEmS1lITzUH+IXAV3fzdw5v1NwJnT/J85wEvTvJx4ELgw8C3gQ/10yrmAB8AzgVeAnwqyS19m/F8lW5O8k/pRp3fVFW/TjLe+m0GrPWwJA+mG4n+DpN/MFGSJElrkVRNNJNBgxoZGalFixatvKEkrabdPrzbsEvQWmLhqyf7TL1015BkcVWNjF6/0jnISR6d5IwkNyX5fZI7ktwwNWVKkiRJwzXIQ3pH0s0XvpDufYJfRjc1QpIkSZp1Bvokuqq6KMk6VXUH8OkkP5ziuiRJkqShGCQg35LkHnRvdfavdA/ubTC1ZUmSJEnDMcgUixf27Q6me//hLYDnTGVRkiRJ0rCsdAS5qi5Psh5wv6o6dBpqkiRJkoZmkHexeDqwBPhWv7wgyYlTXZgkSZI0DINMsXgnsAtwHUBVLQHmT11JkiRJ0vAMEpBvr6rrp7wSSZIkaQYY5F0szknyAmCd/qOXX0P3MdSSJEnSrDPuCHKSz/YvLwa2A24DjgNuAF439aVJkiRJ02+iEeSdkmwJ7A88Dji82bY+cOtUFiZJkiQNw0QB+WN071zxIGBRsz5A9eslSZKkWWXcKRZV9aGqehjwqap6UPP1wKoyHEuSJGlWWum7WFTVK6ejEEmSJGkmGORt3iRJkqS7DAOyJEmS1BjkfZAlSTPIwlcvHHYJkjSrOYIsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVLDgCxJkiQ15gy7AEnSqjl1jz2HXYIk/Zk9Tzt12CWsMY4gS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSY2hBOQkN03BMZ+R5M39632TbDuJY3wvyciark2SJElrj1kzglxVJ1bV+/vFfYFVDsiSJEnSUANyOoclOSfJ0iT79+v36kdzv5zk/CSfT5J+21P6dT9I8qEkX+/XH5jkyCSPAZ4BHJZkSZKt2pHhJJsluax/vV6S45OcneQEYL2mtr2T/CjJmUm+lGTD6b07kiRJGoZhjyA/G1gAPAJ4Il2ovV+/bUfgdXQjwQ
8CdkuyLvBxYJ+q2h2YO/qAVfVD4ETgkKpaUFUXT3D+VwK3VNUOwHuBnaAL0cDbgCdW1SOBRcDrR++c5KAki5IsWrZs2apfvSRJkmacYQfk3YHjquqOqroaOBXYud92elVdWVXLgSXAfGAb4JKqurRvc9xqnn8P4HMAVXU2cHa//tF0wXxhkiXAi4EtR+9cVUdX1UhVjcyd+2dZXZIkSWuhOUM+fybYdlvz+g66WidqP5Hb+dMvA+uO2lbj1HVyVT1/kueTJEnSWmrYI8inAfsnWSfJXLoR3dMnaH8+8KAk8/vl/cdpdyOwUbN8Gf30CWC/Uec/ACDJ9sAO/fof003p2Lrftn6ShwxwPZIkSVrLDTsgf5VuWsNPge8Cb6qqX4/XuKp+B/w98K0kPwCuBq4fo+nxwCFJzkqyFfBvwCuT/BDYrGn3UWDDJGcDb6IP51W1DDgQOK7f9mO66R2SJEma5VI11gyDmSvJhlV1U/+uFh8BLqyqI4Zd18jISC1atGjYZUi6Czh1jz2HXYIk/Zk9Tzt12CWssiSLq+rPPgNj2CPIk/Hy/sG5c4GN6d7VQpIkSVojhv2Q3irrR4uHPmIsSZKk2WltHEGWJEmSpowBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqzBl2AZKkVbPnaacOuwRJmtUcQZYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqeFHTUvSWubIN3xt2CVI0hp38OFPH3YJf+QIsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktSYkQE5yU0r2b5Jkr9vljdP8uX+9YIkT5nEOd+Z5I2rXq0kSZJmkxkZkAewCfDHgFxVv6qq/frFBcAqB2RJkiQJZnhATrJhku8kOTPJ0iTP7De9H9gqyZIkhyWZn+ScJPcA3gXs32/bf/TIcN9ufv/6rUkuSPK/wEObNlsl+VaSxUm+n2SbabtoSZIkDdWcYRewErcCz6qqG5JsBvw4yYnAm4Htq2oBwIrAW1W/T/J2YKSqDu63vXOsAyfZCXgesCPdfTgTWNxvPhp4RVVdmORRwFHA46fkCiVJkjSjzPSAHOB9SfYAlgP3B+67ho79WOCrVXULQB+8SbIh8BjgS0lWtL3nmMUlBwEHAcybN28NlSVJkqRhmukB+QBgLrBTVf0hyWXAuqt4jNu581SSdv8ao/3dgOtWjE5PpKqOphttZmRkZKxjSZIkaS0zo+cgAxsD/9eH48cBW/brbwQ2Gmef0dsuAx4JkOSRwAP79acBz0qyXpKNgKcDVNUNwKVJntvvkySPWHOXJEmSpJlspgfkzwMjSRbRjSafD1BVvwEW9g/cHTZqn1OAbVc8pAf8J3CfJEuAVwI/749xJnACsKRv8/3mGAcAL03yU+Bc4JlIkiTpLmFGTrGoqg3779cAu47T5gWjVm3fr78W2HnUtr3HOcZ7gfeOsf5S4K9WrWpJkiTNBjN9BFmSJEmaVgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpMWfYBUiSVs3Bhz992CVI0qzmCLIkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktRIVQ27hlkhyTLg8mHXsYo2A64ZdhGaMvbv7G
Xfzm727+xm/84sW1bV3NErDch3YUkWVdXIsOvQ1LB/Zy/7dnazf2c3+3ft4BQLSZIkqWFAliRJkhoG5Lu2o4ddgKaU/Tt72bezm/07u9m/awHnIEuSJEkNR5AlSZKkhgFZkiRJahiQZ7kk90lycpIL++/3Hqfdi/s2FyZ58RjbT0xyztRXrEGtTt8mWT/JN5Kcn+TcJO+f3uo1niR/leSCJBclefMY2++Z5IR++0+SzG+2vaVff0GSJ09n3RrMZPs3yZOSLE6ytP/++OmuXSu3On9+++3zktyU5I3TVbPGZkCe/d4MfKeqHgx8p1++kyT3Ad4BPArYBXhHG7aSPBu4aXrK1SpY3b79t6raBtgR2C3JPtNTtsaTZB3gI8A+wLbA85NsO6rZS4HfVtXWwBHAv/T7bgs8D9gO+CvgqP54miFWp3/pPlji6VX1cODFwGenp2oNajX7d4UjgG9Oda1aOQPy7PdM4Nj+9bHAvmO0eTJwclVdW1W/BU6m+weWJBsCrwfeMw21atVMum+r6paqOgWgqn4PnAk8YBpq1sR2AS6qqkv6fjmerp9bbb9/GXhCkvTrj6+q26rqUuCi/niaOSbdv1V1VlX9ql9/LrBukntOS9Ua1Or8+SXJvsAldP2rITMgz373raqrAPrvfzFGm/sDVzTLV/brAN4NHA7cMpVFalJWt28BSLIJ8HS6UWgN10r7q21TVbcD1wObDrivhmt1+rf1HOCsqrptiurU5Ey6f5NsAPwjcOg01KkBzBl2AVp9Sf4X+MsxNr110EOMsa6SLAC2rqp/GD1PStNjqvq2Of4c4DjgQ1V1yapXqDVswv5aSZtB9tVwrU7/dhuT7ej+W37vNViX1ozV6d9DgSOq6qZ+QFlDZkCeBarqieNtS3J1kvtV1VVJ7gf83xjNrgT2apYfAHwP2BXYKclldD8rf5Hke1W1F5oWU9i3KxwNXFhVH1gD5Wr1XQls0Sw/APjVOG2u7H/B2Ri4dsB9NVyr078keQDwVeBFVXXx1JerVbQ6/fsoYL8k/wpsAixPcmtVHTn1ZWssTrGY/U6ke6CD/vt/j9Hm28DeSe7dP8C1N/DtqvpoVW1eVfOB3YGfG45nlEn3LUCS99D95fy6aahVgzkDeHCSBya5B91DdyeOatP2+37Ad6v7xKcTgef1T8k/EHgwcPo01a3BTLp/+6lQ3wDeUlULp61irYpJ91ba2jgAAANqSURBVG9VPbaq5vf/3n4AeJ/heLgMyLPf+4EnJbkQeFK/TJKRJJ8AqKpr6eYan9F/vatfp5lt0n3bj0S9le5J6zOTLEnysmFchP6kn5N4MN0vMecBX6yqc5O8K8kz+mafpJuzeBHdA7Rv7vc9F/gi8DPgW8CrquqO6b4GjW91+rffb2vg//V/XpckGeu5Aw3JavavZhg/alqSJElqOIIsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVLDgCxJkiQ1DMiStJZJ8sNpPt/8JC+YznNK0jAZkCVpLVNVj5muc/Wf9jUfMCBLusvwfZAlaS2T5Kaq2jDJXsChwNXAAuArwFLgtcB6wL5VdXGSY4Bbge2A+wKvr6qvJ1kX+CgwAtzerz8lyYHAU4F1gQ2A9YGHAZcCx9J93PFn+20AB1fVD/t63glcA2wPLAb+pv8kuJ2BD/b73AY8AbiF7gNu9gLuCXykqj6+hm+XJK2yOcMuQJK0Wh5BF16vBS4BPlFVuyR5LfBq/vRR4vOBPYGtgFOSbA28CqCqHp5kG+CkJA/p2+8K7NB/8uJewBur6mkASdYHnlRVtyZ5MHAcXcgG2JEuiP8KWAjsluR04ARg/6o6I8m9gN8BLwWur6qdk9wTWJjkpKq6dArukyQNzIAsSWu3M6rqKoAkFwMn9euXAo9r2n2xqpYDFya5BNgG2B34MEBVnZ/kcmBFQD55go+cvztwZJIFwB3NPgCnV9WVfT1L6IL59cBVVXVGf64b+u17Azsk2a/fd2PgwX
Qj1ZI0NAZkSVq73da8Xt4sL+fOf8ePnk9XQCY47s0TbPsHumkdj6B7luXWceq5o68hY5yffv2rq+rbE5xLkqadD+lJ0l3Dc5PcLclWwIOAC4DTgAMA+qkV8/r1o90IbNQsb0w3IrwceCGwzkrOfT6weT8PmSQb9Q//fRt4ZZK7r6ghyQYTHEeSpoUjyJJ013ABcCrdQ3qv6OcPHwV8LMlSuof0Dqyq25I/G1g+G7g9yU+BY4CjgP9M8lzgFCYebaaqfp9kf+DDSdajm3/8ROATdFMwzkx30mXAvmviYiVpdfguFpI0y/XvYvH1qvrysGuRpLWBUywkSZKkhiPIkiRJUsMRZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqfH/AZjgTXszi7XlAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dc = dropcol_importances(model, X_train, y_train)\n", - "dc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## conclusions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So I would say location is a prime factor, then the number of bedrooms. Bathrooms often is gte bedrooms, and is likely correlated so one of them should likely be dropped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "toc-autonumbering": false, - "toc-showcode": false, - "toc-showmarkdowntxt": false - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/feature_perms/0.8.0/src/feature_perms.ipynb b/functions/development/feature_perms/0.8.0/src/feature_perms.ipynb deleted file mode 100644 index 77da7b55..00000000 --- a/functions/development/feature_perms/0.8.0/src/feature_perms.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# permutation_importances as reusable function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## function code" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import 
numbers\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "from sklearn.utils import check_random_state\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from cloudpickle import load\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem\n", - "from mlrun.artifacts import get_model, PlotArtifact\n", - "from typing import Union, Callable, List\n", - "\n", - "def _get_n_samples_bootstrap(n_samples, max_samples) -> int:\n", - " \"\"\"get the number of samples in a bootstrap sample\n", - " \n", - " returns the total number of samples to draw for the bootstrap sample\n", - " \n", - " private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py\n", - "\n", - " :param n_samples: Number of samples in the dataset.\n", - " :param max_samples: \n", - " The maximum number of samples to draw from the total available:\n", - " - if float, this indicates a fraction of the total and should be\n", - " the interval `(0, 1)`;\n", - " - if int, this indicates the exact number of samples;\n", - " - if None, this indicates the total number of samples.\n", - " \"\"\"\n", - " if max_samples is None:\n", - " return n_samples\n", - "\n", - " if isinstance(max_samples, numbers.Integral):\n", - " if not (1 <= max_samples <= n_samples):\n", - " msg = \"`max_samples` must be in range 1 to {} but got value {}\"\n", - " raise ValueError(msg.format(n_samples, max_samples))\n", - " return max_samples\n", - "\n", - " if isinstance(max_samples, numbers.Real):\n", - " if not (0 < max_samples < 1):\n", - " msg = \"`max_samples` must be in range (0, 1) but got value {}\"\n", - " raise ValueError(msg.format(max_samples))\n", - " return int(round(n_samples * max_samples))\n", - "\n", - " msg = \"`max_samples` should be int or float, but got type '{}'\"\n", - " raise TypeError(msg.format(type(max_samples)))\n", - "\n", - "def _get_unsampled_ix(random_state, n_samples: int) -> np.array:\n", 
- " \"\"\"\n", - " future-proof get unsampled indices\n", - " \"\"\"\n", - " n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - "\n", - " return np.arange(n_samples)[sample_counts==0]\n", - "\n", - "def _oob_classifier_accuracy(rf, X_train, y_train) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train\n", - " y = y_train.values if isinstance(y_train, pd.Series) else y_train\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score\n", - "\n", - "def permutation_importances(\n", - " context: MLClientCtx,\n", - " model: DataItem,\n", - " dataset: DataItem,\n", - " labels: str,\n", - " figsz=(10, 5),\n", - " plots_dest: str = \"plots\",\n", - " fitype: str = \"permute\"\n", - ") -> pd.DataFrame:\n", - " \"\"\"calculate change in metric\n", - " \n", - " type 'permute' uses a pre-estimated model\n", - " type 'dropcol' uses a re-estimates model\n", - " \n", - " :param context: the function's execution context\n", - " :param model: a trained model\n", - " :param dataset: 
features and ground truths, regression targets\n", - " :param labels name of the ground truths column\n", - " :param figsz: matplotlib figure size\n", - " :param plots_dest: path within artifact store\n", - " :\n", - " \"\"\"\n", - " model_file, model_data, _ = get_model(model.url, suffix='.pkl')\n", - " model = load(open(str(model_file), \"rb\"))\n", - " \n", - " X = dataset.as_df()\n", - " y = X.pop(labels)\n", - " header = X.columns\n", - " \n", - " # this will be paramettrized next version, and include regression\n", - " metric = _oob_classifier_accuracy\n", - " \n", - " baseline = metric(model, X, y)\n", - " \n", - " imp = []\n", - " for col in X.columns:\n", - " if fitype is \"permute\":\n", - " save = X[col].copy()\n", - " X[col] = np.random.permutation(X[col])\n", - " m = metric(model, X, y)\n", - " X[col] = save\n", - " imp.append(baseline - m)\n", - " elif fitype is \"dropcol\":\n", - " X_ = X.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_, y)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " else:\n", - " raise ValueError(\"unknown fitype, only 'permute' or 'dropcol' permitted\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(f\"feature importances-{fitype}\")\n", - " plt.tight_layout()\n", - "\n", - " context.log_artifact(PlotArtifact(f\"feature importances-{fitype}\", body=plt.gcf()),\n", - " local_path=f\"{plots_dest}/feature-permutations.html\")\n", - " context.log_dataset(f\"feature-importances-{fitype}-tbl\", df=feature_imp, index=False)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## save function" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.platforms.other import auto_mount\n", - "\n", - "gpus = False\n", - "\n", - "# create job function object from notebook code\n", - "fn_params = {\n", - " \"name\" : \"feature-perms\",\n", - " \"handler\" : \"permutation_importances\",\n", - " \"kind\" : \"job\",\n", - " \"image\" : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n", - " \"description\" : \"estimate feature importances using permutations\",\n", - " \"categories\" : [\"analysis\"],\n", - " \"labels\" : {\"author\": \"yjb\"}\n", - "}\n", - "\n", - "perms_fn = code_to_function(**fn_params)\n", - "perms_fn.apply(auto_mount())\n", - "perms_fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "from mlrun import NewTask, mlconf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### get some data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:25,486 Job is running in 
the background, pod: tasks-arc-to-parq-xqkr7\n", - "[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet\n", - "[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp\n", - "[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading\n", - "[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default\n", - "[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed\n" - ] - } - ], - "source": [ - "data_url = \"https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv\"\n", - "\n", - "fn = import_function(\"hub://arc_to_parquet\", \"a2p\")\n", - "fn.apply(auto_mount())\n", - "\n", - "params = {\n", - " \"name\" : \"tasks arc-to-parq\",\n", - " \"params\" : {\"key\":\"rent\", \"stats\": True, \"file_ext\":\"csv\"}\n", - "}\n", - "acquire_run = fn.run(NewTask(**params),inputs={\"archive_url\" : data_url},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### train a model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5\n", - "[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model\n", - "[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y\n", - "[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N\n", - "[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N\n", - "[mlrun] 2020-06-07 19:58:37,806 log artifact 
precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N\n", - "[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N\n", - "[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default\n", - "[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed\n" - ] - } - ], - "source": [ - "fn = import_function(\"hub://sklearn_classifier\", \"skrf\")\n", - "fn.apply(auto_mount())\n", - "\n", - "# define model\n", - "params = {\n", - " \"name\" : \"tasks random forest\",\n", - " \"params\" : {\n", - " \"sample\" : -5_000, # 5k random rows,\n", - " \"model_pkg_class\" : \"sklearn.ensemble.RandomForestClassifier\",\n", - " \"label_column\" : \"interest_level\",\n", - " \"CLASS_n_estimators\" : 100,\n", - " \"CLASS_min_samples_leaf\" : 1,\n", - " \"CLASS_n_jobs\" : -1,\n", - " \"CLASS_oob_score\" : True}\n", - "}\n", - "\n", - "train_run = fn.run(NewTask(**params), inputs={\"dataset\" : acquire_run.outputs[\"rent\"]},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Importances

\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=train_run.outputs['feature-importances'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "data = acquire_run.outputs[\"rent\"]\n", - "labels = \"interest_level\"\n", - "model = train_run.outputs[\"model\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt\n", - "[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances\n", - "[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y\n", - "[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default\n", - "[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed\n" - ] - } - ], - "source": [ - "fi_perms = perms_fn.run(\n", - " NewTask(params={\"labels\": labels, \n", - " \"plots_dest\": \"plots\"}),\n", - " inputs={\"model\": model, \"dataset\": data},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=fi_perms.outputs['feature importances-permute'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/feature_perms/0.8.0/src/feature_perms.py b/functions/development/feature_perms/0.8.0/src/feature_perms.py deleted file mode 100644 index 3a6c9948..00000000 --- a/functions/development/feature_perms/0.8.0/src/feature_perms.py +++ /dev/null @@ -1,160 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import numpy as np -import pandas as pd -import numbers - 
-import sklearn -from sklearn.base import clone -from sklearn.utils import check_random_state - -import matplotlib.pyplot as plt -import seaborn as sns - -from cloudpickle import load - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem -from mlrun.artifacts import get_model, PlotArtifact -from typing import Union, Callable, List - - -def _get_n_samples_bootstrap(n_samples, max_samples) -> int: - """get the number of samples in a bootstrap sample - - returns the total number of samples to draw for the bootstrap sample - - private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py - - :param n_samples: Number of samples in the dataset. - :param max_samples: - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. - """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _get_unsampled_ix(random_state, n_samples: int) -> np.array: - """ - future-proof get unsampled indices - """ - n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples) - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_bootstrap) - sample_counts = np.bincount(sample_indices, minlength=n_samples) - - return 
np.arange(n_samples)[sample_counts == 0] - - -def _oob_classifier_accuracy(rf, X_train, y_train) -> float: - """ - Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier. - - https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425 - """ - X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train - y = y_train.values if isinstance(y_train, pd.Series) else y_train - - n_samples = len(X) - n_classes = len(np.unique(y)) - predictions = np.zeros((n_samples, n_classes)) - for tree in rf.estimators_: - unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples) - tree_preds = tree.predict_proba(X[unsampled_indices, :]) - predictions[unsampled_indices] += tree_preds - - predicted_class_indexes = np.argmax(predictions, axis=1) - predicted_classes = [rf.classes_[i] for i in predicted_class_indexes] - - oob_score = np.mean(y == predicted_classes) - - return oob_score - - -def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline 
- m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - ) diff --git a/functions/development/feature_perms/0.8.0/src/function.yaml b/functions/development/feature_perms/0.8.0/src/function.yaml deleted file mode 100644 index e9d3913a..00000000 --- a/functions/development/feature_perms/0.8.0/src/function.yaml +++ /dev/null @@ -1,63 +0,0 @@ -kind: job -metadata: - name: feature-perms - tag: '' - hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7 - project: default - labels: - author: yjb - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: permutation_importance - entry_points: - permutation_importance: - name: permutation_importance - doc: 'calculate change in metric - - - type ''permute'' uses a pre-estimated model - - type ''dropcol'' uses a re-estimates model' - parameters: - - name: context - type: MLClientCtx - doc: the function's execution context - default: '' - - name: model - type: DataItem - doc: a trained model - default: '' - - name: dataset - type: DataItem - doc: features and ground truths, regression targets - default: '' - - name: labels - type: str - default: '' - - name: figsz - doc: 
matplotlib figure size - default: - - 10 - - 5 - - name: plots_dest - type: str - doc: path within artifact store - default: plots - - name: fitype - type: str - default: permute - outputs: - - default: '' - lineno: 93 - description: estimate feature importances using permutations - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0
gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3N
lcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA
9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py - affinity: null -verbose: false diff --git a/functions/development/feature_perms/0.8.0/src/item.yaml b/functions/development/feature_perms/0.8.0/src/item.yaml deleted file mode 100644 index 8a3f940c..00000000 --- a/functions/development/feature_perms/0.8.0/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: 
-- data-analysis -description: estimate feature importances using permutations -doc: '' -example: feature_perms.ipynb -generationDate: 2021-05-19:22-41 -icon: '' -labels: - author: yjb -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: feature-perms -platformVersion: 3.2.0 -spec: - filename: feature_perms.py - handler: permutation_importance - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 0.8.0 diff --git a/functions/development/feature_perms/0.8.0/src/requirements.txt b/functions/development/feature_perms/0.8.0/src/requirements.txt deleted file mode 100644 index f53a3289..00000000 --- a/functions/development/feature_perms/0.8.0/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -mlrun -sklearn -matplotlib -seaborn -scikit-plot - diff --git a/functions/development/feature_perms/0.8.0/src/test_feature_perms.py b/functions/development/feature_perms/0.8.0/src/test_feature_perms.py deleted file mode 100644 index b270292d..00000000 --- a/functions/development/feature_perms/0.8.0/src/test_feature_perms.py +++ /dev/null @@ -1,98 +0,0 @@ -from mlrun import code_to_function, import_function -from pathlib import Path -import os - -ARTIFACTS_PATH = 'artifacts' -DATA_URL = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv" -FEATURE_OUTPUT = ARTIFACTS_PATH + "/feature-importances-permute-tbl.parquet" - - -def arc_to_parquet(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - archive_func = import_function('hub://arc_to_parquet') - archive_run = archive_func.run(handler="arc_to_parquet", - params={"key": "rent", "stats": True, "file_ext": "csv"}, - inputs={"archive_url": DATA_URL}, - artifact_path=os.getcwd() + '/artifacts' - , local=True - ) - - -def sklearn_classifier(run): - cwd = os.getcwd() - file_path = str(Path(cwd).parent.absolute()) + "/sklearn_classifier/sklearn_classifier.py" - fn = code_to_function(name='test_sklearn_classifier', - 
filename=file_path, - handler="train_model", - kind="local", - ) - fn.spec.command = file_path - fn.run(params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - handler="train_model", - inputs={"dataset": run.outputs["rent"]}, - artifact_path='artifacts' - # , local=True - ) - - -def train_model(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - train = import_function('hub://sklearn_classifier') - # .apply(auto_mount()) - - train_run = train.run( - inputs={"dataset": "artifacts/rent.csv"}, - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - local=True) - - -def test_feature_selection_run_local(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fn = code_to_function(name='test_run_local_feature_perms', - filename="feature_perms.py", - handler="permutation_importance", - kind="local", - ) - fn.spec.command = "feature_perms.py" - fi_perms = fn.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path='artifacts') - assert Path(FEATURE_OUTPUT).is_file() - - -def test_feature_perms_import_function(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fi_perms = import_function("function.yaml") - fi_perms.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path=os.getcwd() + '/artifacts' - , local=True) - assert Path(FEATURE_OUTPUT).is_file() diff --git 
a/functions/development/feature_perms/0.8.0/static/documentation.html b/functions/development/feature_perms/0.8.0/static/documentation.html deleted file mode 100644 index d485113f..00000000 --- a/functions/development/feature_perms/0.8.0/static/documentation.html +++ /dev/null @@ -1,145 +0,0 @@ - - - - - - - -feature_perms package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

feature_perms package

-
-

Submodules

-
-
-

feature_perms.feature_perms module

-
-
-feature_perms.feature_perms.permutation_importance(context: mlrun.execution.MLClientCtx, model: mlrun.datastore.base.DataItem, dataset: mlrun.datastore.base.DataItem, labels: str, figsz=(10, 5), plots_dest: str = 'plots', fitype: str = 'permute')pandas.core.frame.DataFrame[source]
-

calculate change in metric

-

type ‘permute’ uses a pre-estimated model -type ‘dropcol’ uses a re-estimates model

-
-
Parameters
-
    -
  • context – the function’s execution context

  • -
  • model – a trained model

  • -
  • dataset – features and ground truths, regression targets

  • -
-
-
-

:param labels name of the ground truths column -:param figsz: matplotlib figure size -:param plots_dest: path within artifact store -:

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/0.8.0/static/example.html b/functions/development/feature_perms/0.8.0/static/example.html deleted file mode 100644 index 955c97e1..00000000 --- a/functions/development/feature_perms/0.8.0/static/example.html +++ /dev/null @@ -1,1065 +0,0 @@ - - - - - - - -permutation_importances as reusable function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

permutation_importances as reusable function

-
-

function code

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-    
-    returns the total number of samples to draw for the bootstrap sample
-    
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples: 
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts==0]
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-    
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-    
-    return oob_score
-
-def permutation_importances(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute"
-) -> pd.DataFrame:
-    """calculate change in metric
-    
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-    
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix='.pkl')
-    model = load(open(str(model_file), "rb"))
-    
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-    
-    # this will be paramettrized next version, and include regression
-    metric = _oob_classifier_accuracy
-    
-    baseline = metric(model, X, y)
-    
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    # create a feature importance table with desired labels
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-                         local_path=f"{plots_dest}/feature-permutations.html")
-    context.log_dataset(f"feature-importances-{fitype}-tbl", df=feature_imp, index=False)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

save function

-
-
-
from mlrun import code_to_function
-from mlrun.platforms.other import auto_mount
-
-gpus = False
-
-# create job function object from notebook code
-fn_params = {
-    "name"        : "feature-perms",
-    "handler"     : "permutation_importances",
-    "kind"        : "job",
-    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
-    "description" : "estimate feature importances using permutations",
-    "categories"  : ["analysis"],
-    "labels"      : {"author": "yjb"}
-}
-
-perms_fn = code_to_function(**fn_params)
-perms_fn.apply(auto_mount())
-perms_fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7feb0104efd0>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-from mlrun import NewTask, mlconf
-
-
-
-
-
-

get some data

-
-
-
data_url = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv"
-
-fn = import_function("hub://arc_to_parquet", "a2p")
-fn.apply(auto_mount())
-
-params = {
-    "name" : "tasks arc-to-parq",
-    "params" : {"key":"rent", "stats": True, "file_ext":"csv"}
-}
-acquire_run = fn.run(NewTask(**params),inputs={"archive_url" : data_url},
-                     artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:25,486 Job is running in the background, pod: tasks-arc-to-parq-xqkr7
-[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet
-[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp
-[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading
-[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y
-
-[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default
-[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed
-
-
-
-
-
-
-

train a model

-
-
-
fn = import_function("hub://sklearn_classifier", "skrf")
-fn.apply(auto_mount())
-
-# define model
-params = {
-    "name" : "tasks random forest",
-    "params" : {
-        "sample"                 : -5_000, # 5k random rows,
-        "model_pkg_class"        : "sklearn.ensemble.RandomForestClassifier",
-        "label_column"           : "interest_level",
-        "CLASS_n_estimators"     : 100,
-        "CLASS_min_samples_leaf" : 1,
-        "CLASS_n_jobs"           : -1,
-        "CLASS_oob_score"        : True}
-}
-
-train_run = fn.run(NewTask(**params), inputs={"dataset" : acquire_run.outputs["rent"]},
-                   artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5
-[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model
-[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y
-[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N
-[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N
-[mlrun] 2020-06-07 19:58:37,806 log artifact precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N
-[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N
-[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y
-
-[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default
-[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=train_run.outputs['feature-importances'])
-
-
-
-
-

Feature Importances

-
-
-
-
-
data   = acquire_run.outputs["rent"]
-labels = "interest_level"
-model  = train_run.outputs["model"]
-
-
-
-
-
-
-
fi_perms = perms_fn.run(
-    NewTask(params={"labels": labels, 
-                    "plots_dest": "plots"}),
-    inputs={"model": model, "dataset": data},
-    artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt
-[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances
-[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y
-[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y
-
-[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default
-[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=fi_perms.outputs['feature importances-permute'])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/0.8.0/static/function.html b/functions/development/feature_perms/0.8.0/static/function.html deleted file mode 100644 index 662b2eb8..00000000 --- a/functions/development/feature_perms/0.8.0/static/function.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: feature-perms
-  tag: ''
-  hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7
-  project: default
-  labels:
-    author: yjb
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: permutation_importance
-  entry_points:
-    permutation_importance:
-      name: permutation_importance
-      doc: 'calculate change in metric
-
-
-        type ''permute'' uses a pre-estimated model
-
-        type ''dropcol'' uses a re-estimates model'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function's execution context
-        default: ''
-      - name: model
-        type: DataItem
-        doc: a trained model
-        default: ''
-      - name: dataset
-        type: DataItem
-        doc: features and ground truths, regression targets
-        default: ''
-      - name: labels
-        type: str
-        default: ''
-      - name: figsz
-        doc: matplotlib figure size
-        default:
-        - 10
-        - 5
-      - name: plots_dest
-        type: str
-        doc: path within artifact store
-        default: plots
-      - name: fitype
-        type: str
-        default: permute
-      outputs:
-      - default: ''
-      lineno: 93
-  description: estimate feature importances using permutations
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1
iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10
gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9
kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.8.0/static/item.html b/functions/development/feature_perms/0.8.0/static/item.html deleted file mode 100644 index e9f2a9e3..00000000 --- a/functions/development/feature_perms/0.8.0/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: estimate feature importances using permutations
-doc: ''
-example: feature_perms.ipynb
-generationDate: 2021-05-19:22-41
-icon: ''
-labels:
-  author: yjb
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: feature-perms
-platformVersion: 3.2.0
-spec:
-  filename: feature_perms.py
-  handler: permutation_importance
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.8.0/static/source.html b/functions/development/feature_perms/0.8.0/static/source.html deleted file mode 100644 index 3013427a..00000000 --- a/functions/development/feature_perms/0.8.0/static/source.html +++ /dev/null @@ -1,182 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-def permutation_importance(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute",
-) -> pd.DataFrame:
-    """calculate change in metric
-
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
-    model = load(open(str(model_file), "rb"))
-
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-
-    metric = _oob_classifier_accuracy
-
-    baseline = metric(model, X, y)
-
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            #model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(
-        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-        local_path=f"{plots_dest}/feature-permutations.html",
-    )
-    context.log_dataset(
-        f"feature-importances-{fitype}-tbl", df=feature_imp, index=False
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.9.0/src/README.ipynb b/functions/development/feature_perms/0.9.0/src/README.ipynb deleted file mode 100644 index 0929a6f6..00000000 --- a/functions/development/feature_perms/0.9.0/src/README.ipynb +++ /dev/null @@ -1,788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# feature importances" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are a number of ways to compute feature importances and **the default estimates reported by scikit learn can be shown to be biased** under certain circumstances. In addition, many non-tree algorithms do not provide conveniently calculated feature importance estimates. The following demonstration is based on material that draws heavily from the following sources:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## references\n", - "\n", - "\n", - "### repos\n", - "\n", - "* **[Feature importances for scikit-learn machine learning models](https://github.com/parrt/random-forest-importances)**, [MIT License](https://github.com/parrt/random-forest-importances/blob/master/LICENSE)\n", - "* **[Scikit-Learn ensemble module - forests](https://github.com/scikit-learn/scikit-learn/blob/0.23.1/sklearn/ensemble/_forest.py)**, [BSD License](https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/ensemble/_forest.py#L40)\n", - "* **[ELI5 - Permutation Importance](https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html)** \n", - "\n", - "### articles\n", - "\n", - "Strobl, C., Boulesteix, A., Zeileis, A. et al. **[Bias in random forest variable importance measures: Illustrations, sources and a solution](https://link.springer.com/article/10.1186/1471-2105-8-25#citeas)**. BMC Bioinformatics 8, 25 (2007). https://doi.org/10.1186/1471-2105-8-25 \n", - "\n", - "Strobl, C., Boulesteix, A., Kneib, T. et al. 
**[Conditional variable importance for random forests](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307#citeas)**. BMC Bioinformatics 9, 307 (2008). https://doi.org/10.1186/1471-2105-9-307 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## what we'll do\n", - "\n", - "* demonstrate an issue with default feature importance estimates \n", - "* provide alternatives and compare to the default \n", - "* create a new function `feature_perms` that implements a computationally simple algorithm \n", - "* create a new function `dropcol_importances` that implements a computationally intensive algorithm that is more accurate\n", - "* test our new functions\n", - "\n", - "It should be noted that although we are developing this notebook using a classification example, an almost identical presentation can be done for regression." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier as SomeModel\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from typing import Union, Callable, List" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## default feature importances\n", - "\n", - "This is a function that plots default feature importances from an estimated model object when available. 
It is taken from mlrun's current source-code implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def feature_importances(\n", - " model: SomeModel,\n", - " header: List[str], \n", - " figsz=(10, 5)\n", - ") -> None:\n", - " \"\"\"Display default model feature importances\n", - "\n", - " Only works for models with attribute 'feature_importances_`\n", - "\n", - " :param model: fitted model with a feature_importances_ attribute\n", - " :param header: feature labels\n", - " :param figsz: matplotlib figure size\n", - " \"\"\"\n", - " if not hasattr(model, \"feature_importances_\"):\n", - " raise Exception(\n", - " \"feature importances are only available for some models\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(model.feature_importances_, header)\n", - " feature_imp = pd.DataFrame(\n", - " sorted(zipped), columns=[\"freq\", \"feature\"]).sort_values(\n", - " by=\"freq\", ascending=False)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"freq\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"features\")\n", - " plt.tight_layout();\n", - " \n", - " return feature_imp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permuted features\n", - "\n", - "A proposed solution that has general applicability is randomly permuted features**[refs](#references)**: \n", - "* loop through the feature set \n", - "* shuffle one feature \n", - "* run predict\n", - "* compare the (marginal) change in accuracy (or other metric of interest) \n", - "\n", - "This approach is computationally more demanding than relying on the default values, however it can be easily parallelized. To perform the estimation we only need an estimated model and a held-out test set. 
The following was proposed in **[Beware Default Random Forest Importances](https://explained.ai/rf-importance/index.html)**:\n", - "\n", - "( the following 3 glue functions will no longer be publicly visible in the sklearn package from 0.24 onwards, consider this a temporary hack while we refactor these away)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**the following has been refactored in final version of function:**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from distutils.version import LooseVersion\n", - "import numpy as np\n", - "from sklearn.utils import check_random_state\n", - "\n", - "def _generate_sample_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to _parallel_build_trees function.\n", - " taken from:\n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L116\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n", - "\n", - " return sample_indices\n", - "\n", - "def _generate_unsampled_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to forest._set_oob_score function.\n", - " taken from: \n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L126\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " sample_indices = _generate_sample_indices(random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - " unsampled_mask = sample_counts == 0\n", - " indices_range = np.arange(n_samples)\n", - " unsampled_indices 
= indices_range[unsampled_mask]\n", - "\n", - " return unsampled_indices\n", - "\n", - "def _get_unsampled_indices(tree, n_samples: int):\n", - " \"\"\"\n", - " An interface to get unsampled indices regardless of sklearn version.\n", - " \"\"\"\n", - " import warnings\n", - " warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", - " if LooseVersion(sklearn.__version__) >= LooseVersion(\"0.22\"):\n", - " # Version 0.22 or newer uses 3 arguments.\n", - " from sklearn.ensemble.forest import _get_n_samples_bootstrap\n", - " n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " return _generate_unsampled_indices(tree.random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " else:\n", - " # Version 0.21 or older uses only two arguments.\n", - " return _generate_unsampled_indices(tree.random_state, n_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following function estimates classifier accuracy and has been borrowed from **[references](#references)**. 
See **[breitman on oob](https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf)** for details on out-of-bag estimation:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def oob_classifier_accuracy(rf, X_train: np.array, y_train: np.array) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values\n", - " y = y_train.values\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_indices(tree, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Putting it all together:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def permutation_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array, \n", - " header: List[str],\n", - " metric: Callable = oob_classifier_accuracy,\n", - " figsz=(10, 5)\n", - ") -> np.array:\n", - " \"\"\"calculate change in metric from permuting feature columns\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " uses a pre-estimated model\n", - "\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truths, regression targets\n", - " :param header: column labels for X_train\n", - " 
:param figsz: matplotlib figure size\n", - " \n", - " \"\"\"\n", - " baseline = metric(model, X_train, y_train)\n", - " imp = []\n", - " for col in X_train.columns:\n", - " save = X_train[col].copy()\n", - " X_train[col] = np.random.permutation(X_train[col])\n", - " m = metric(model, X_train, y_train)\n", - " X_train[col] = save\n", - " imp.append(baseline - m)\n", - " \n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"feature permutation importances\")\n", - " plt.tight_layout()\n", - "\n", - " return np.array(feature_imp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances\n", - "\n", - "According to our **[references](#references)** a more accurate measure of feature importance would have us re-estimate the model after dropping a column. This is considered as being close to \"ideal\". 
Unfortunately, the entire model needs to be re-estimated for each column and without some approximating shortcut this is likely to be infeasible for large datasets.\n", - "\n", - "Here is the suggested implementation and **don't run this on big models!**:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def dropcol_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array,\n", - " header: List[str] = [],\n", - " random_state: int = 1994,\n", - " figsz=(10, 5)\n", - ") -> pd.DataFrame:\n", - " \"\"\"drop columns and re-estimate model\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " :param rf: model to fit\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truth labels\n", - "\n", - " Returns:\n", - " pd.DataFrame: table of diffs vs baseline metric\n", - " \"\"\"\n", - " # cloning makes copy of model pre-fit\n", - " # calculate a baseline with all features\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_train, y_train)\n", - " baseline = model_.oob_score_\n", - " \n", - " # now drop each colum, refit model and calc metric\n", - " imp = []\n", - " for col in X_train.columns:\n", - " X = X_train.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X, y_train)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " \n", - " # put it all in a table\n", - " imp = np.array(imp)\n", - " feature_imps = pd.DataFrame(\n", - " data={'feature': X_train.columns,\n", - " 'importance': imp})\n", - " #feature_imps.set_index('feature', inplace=True)\n", - " feature_imps.sort_values('importance', ascending=True, inplace=True)\n", - " \n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imps)\n", - " plt.title(\"drop column feature 
importances\")\n", - " plt.tight_layout()\n", - " \n", - " return feature_imps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## demonstration\n", - "\n", - "In this demonstratuon we are going to take a fraction of a fraction of **[Kaggle's RentHop rental listing interest competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)**--the complete dataset is presently >80GB, we'll be looking at 5K rows. \n", - "\n", - "The competition's **[goal](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)** was\n", - "> to predict the number of inquiries a new listing receives based on the listing’s creation date and other features. \n", - "\n", - "Doing so would help **[RentHop](https://www.renthop.com/)**\n", - "> better handle fraud control, identify potential listing quality issues, and allow owners and agents to better understand renters’ needs and preferences." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = \"/User/artifacts/two-sigma-connect-rental-listing-inquiries/\"\n", - "NFRAC = 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample dimensions (4935, 6)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bathroomsbedroomspricelongitudelatitudeinterest_level
182351.001800-73.996040.71972
421401.023000-73.987940.76533
46771.011350-73.899640.85492
\n", - "
" - ], - "text/plain": [ - " bathrooms bedrooms price longitude latitude interest_level\n", - "18235 1.0 0 1800 -73.9960 40.7197 2\n", - "42140 1.0 2 3000 -73.9879 40.7653 3\n", - "4677 1.0 1 1350 -73.8996 40.8549 2" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(data + 'rent.csv').sample(frac=NFRAC)\n", - "print(\"sample dimensions\", df.shape)\n", - "df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']\n", - "dfr = df[features]\n", - "\n", - "# drop price column\n", - "X_train, y_train = dfr.drop('price', axis=1), dfr['price']\n", - "\n", - "# insert column with random values\n", - "X_train['random'] = np.random.random(size=len(X_train))\n", - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'random']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_jobs=-1, oob_score=True)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# define model\n", - "model_params = {\n", - " \"n_estimators\" : 100, \n", - " \"min_samples_leaf\" : 1,\n", - " \"n_jobs\" : -1,\n", - " \"oob_score\" : True\n", - "}\n", - "\n", - "model = SomeModel(**model_params)\n", - "\n", - "# estimate\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### to run this the model needs a default attribute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "default feature_importances [0.01683784 0.03215169 0.29983429 0.30418813 0.34698806]\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsgAAAFgCAYAAACmDI9oAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAdgklEQVR4nO3deZRlZX3u8e8jLTMC2jiBTQMiCIggjYgiBON1vAhGvKg4oFxxAuN1iokuoq5lrjfINSoqtsbZBSqJWa3eiDNoK0J309AgIMqgBoMMMiOm4Xf/OLvja6Wq+nRVnTqnqr+ftc6qfd7z7r1/+117NU+9vGdXqgpJkiRJPfcbdgGSJEnSKDEgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJM1xSfZIcmGS25O8ftj1SNJcZ0CWpLnvrcD3q2qbqvrgVA+S5PtJ/ucM1iVJc5IBWZLmvp2BS4ddRJIFw65BkmaCAVmS5rAk3wUOB05Lcke33OJ9SX6Z5PokpyfZouu7fZKvJbkhye+67Z26z94DPLk5zmlJFiepNvi2s8xJjkuyPMn7k9wMvLNrf0WSy7pznJ1k5649Xd/fJrk1ycVJ9pnN8ZKkfhiQJWkOq6qnAD8ATqyqrYHXAI8C9gMeCewInNx1vx/wKXozzouAu4HTuuO8vT1OVZ3YZwkHAVcBDwbek+Qo4G+AvwB26I55Rtf3acChXX3bAccAN03pwiVpgAzIkjRPJAnwSuB/VdXNVXU78HfACwCq6qaq+qequqv77D3AYdM87XVV9aGqWltVdwOvAv53VV1WVWu78+/XzSL/B7ANsCeQrs9vpnl+SZpxBmRJmj92ALYEVia5JcktwDe6dpJsmeRjSa5NchtwLrBdkk2mcc5fjXm/M/CB5vw3AwF2rKrv0pux/jBwfZKlSR4wjXNL0kAYkCVp/riR3rKJvatqu+61bbf0AuBNwB7AQVX1AHrLHaAXYAFqzPHu7H5u2bQ9dEyfsfv8CnhVc/7tqmqLqvoRQFV9sKoOAPamt9TiLVO4TkkaKAOyJM0TVXUf8HHg/UkeDJBkxyRP77psQy9A35LkgcDfjjnE9cCuzfFuAP4NeHGSTZK8AthtPWWcDvx1kr2782+b5Pnd9oFJDkpyf3rh+/fAvVO/YkkaDAOyJM0vfwX8HDivW0bxbXqzxgD/AGxBb6b5PHrLL1ofAI7unj6x7nnKr6Q3y3sTvVnfH0128qr6CvB/gDO7818CPLP7+AH0AvzvgGu7Y75vapcpSYOTqrH/d0ySJEnaeDmDLEmSJDUMyJIkSVLDgCxJkiQ1DMiSJElSY8GwC5gvFi5cWIsXLx52GZIkSerTypUrb6yqHca2G5BnyOLFi1mxYsWwy5AkSVKfklw7XrtLLCRJkqSGAVmSJElqGJAlSZKkhgFZkiRJavglvRly2a9v4oC3fHbYZUiSJM05K0956bBL+BPOIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEmNjSIgJ7kmycJh1yFJkqTRN/IBOT0jX6ckSZLmh5EMnkkWJ7ksyUeAVcA/JlmR5NIk72r6XZPkXUlWJVmTZM+u/UFJvpnkwiQfA9Ls88Ykl3SvNzTnuzzJJ7r2LyR5apLlSa5M8vhZHgJJkiQNyUgG5M4ewGeran/gTVW1BNgXOCzJvk2/G6vqccBHgTd3bX8L/LDbdxmwCCDJAcDLgYOAJwCvTLJ/t88jgQ9059gTeBFwSHfMvx
nYVUqSJGmkjHJAvraqzuu2/0eSVcCFwN7AXk2/f+5+rgQWd9uHAp8HqKqvA7/r2g8BvlJVd1bVHd2+T+4+u7qq1lTVfcClwHeqqoA1zXH/RJITupntFWvvun1aFytJkqTRMMoB+U6AJLvQm8X986raF/g6sHnT757u573Agqa9xjlmxmkbexyA+5r394057h9PULW0qpZU1ZIFW24zyaElSZI0V4xyQF7nAfTC8q1JHgI8s499zgWOBUjyTGD7pv2oJFsm2Qp4LvCDmS9ZkiRJc9W4M6OjpKouSnIhvWUPVwHL+9jtXcAZ3bKMc4BfdsdaleTTwPldv09U1YVJFs903ZIkSZqb0ltmq+na6qG71J4vedf6O0qSJOlPrDzlpUM5b5KV3YMg/sRcWGIhSZIkzRoDsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1Fgw7ALmi0fv9CBWnPLSYZchSZKkaXIGWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJaiwYdgHzxR9+cym/fPdjhl2GJEkaMYtOXjPsErSBnEGWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGiMZkJPcsZ7Pt0vy2ub9w5Oc1W3vl+RZUzjnO5O8ecOrlSRJ0nwykgG5D9sB/xmQq+q6qjq6e7sfsMEBWZIkSYIRD8hJtk7ynSSrkqxJcmT30XuB3ZKsTnJKksVJLkmyKfBu4Jjus2PGzgx3/RZ3229PckWSbwN7NH12S/KNJCuT/CDJnrN20ZIkSRqqBcMuYD1+Dzy3qm5LshA4L8ky4G3APlW1H8C6wFtVf0hyMrCkqk7sPnvneAdOcgDwAmB/euOwCljZfbwUeHVVXZnkIOAjwFPGOcYJwAkAO257/5m4XkmSJA3ZqAfkAH+X5FDgPmBH4CEzdOwnA1+pqrsAuuBNkq2BJwJfTrKu72bjHaCqltIL0+y74xY1Q3VJkiRpiEY9IB8L7AAcUFX/keQaYPMNPMZa/nQpSbv/eKH2fsAt62anJUmStHEZ6TXIwLbAb7twfDiwc9d+O7DNBPuM/ewa4HEASR4H7NK1nws8N8kWSbYBjgCoqtuAq5M8v9snSR47c5ckSZKkUTbqAfkLwJIkK+jNJl8OUFU3Acu7L9ydMmaf7wF7rfuSHvBPwAOTrAZeA/ysO8Yq4IvA6q7PD5pjHAscn+Qi4FLgSCRJkrRRSJVLZ2fCvjtuUV971SOHXYYkSRoxi05eM+wSNIEkK6tqydj2UZ9BliRJkmaVAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpMaCYRcwX2z6sL1ZdPKKYZchSZKkaXIGWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJaiwYdgHzxeW/vZwnfehJwy5DkiTNsuUnLR92CZphziBLkiRJDQOyJEmS1DAgS5IkSQ0DsiRJktQwIEuSJEkNA7IkSZLUMCBLkiRJjfUG5CQPSfKPSf61e79XkuMHX5okSZI0+/qZQf40cDbw8O79z4A3DKogSZIkaZj6CcgLq+pLwH0AVb
UWuHegVUmSJElD0k9AvjPJg4ACSPIE4NaBViVJkiQNyYI++rwRWAbslmQ5sANw9ECrkiRJkoZk0oCc5H7A5sBhwB5AgCuq6j9moTZJkiRp1k0akKvqviSnVtXBwKWzVJMkSZI0NP2sQf5mkuclycCrkSRJkoas3zXIWwFrk/ye3jKLqqoHDLQySZIkaQjWG5CrapvZKESSJEkaBesNyEkOHa+9qs6d+XIkSZKk4epnicVbmu3NgccDK4GnDKQiSZIkaYjW+yW9qjqief03YB/g+umcNMkd09l/gmM+J8nbuu2jkuw1hWN8P8mSma5NkiRJc0c/T7EY69f0QvJIqaplVfXe7u1RwAYHZEmSJGm9ATnJh5J8sHudBvwAuGgmTp6eU5JckmRNkmO69j/rZnPPSnJ5ki+se8xckmd1bT/savpa135cktOSPBF4DnBKktVJdmtnhpMsTHJNt71FkjOTXJzki8AWTW1PS/LjJKuSfDnJ1jNxzZIkSRpt/axBXtFsrwXOqKrlM3T+vwD2Ax4LLAQuSLLuy3/7A3sD1wHLgSclWQF8DDi0qq5OcsbYA1bVj5IsA75WVWcBTPII59cAd1XVvkn2BVZ1/RcC7wCeWlV3Jvkreo+7e3e7c5ITgBMANt1+0ykOgSRJkkZJPwF5u6r6QNuQ5C/Htk3RIfQC973A9UnOAQ4EbgPOr6pfd+dbDSwG7gCuqqqru/3PoAuoU3Qo8EGAqro4ycVd+xPoLdFY3oXrTYEfj925qpYCSwG2XrR1TaMOSZIkjYh+1iC/bJy242bo/JP9db57mu176YX5qf41v7X88Vo3H/PZeME2wLeqar/utVdVHT/Fc0uSJGkOmTAgJ3lhkq8CuyRZ1ry+B9w0Q+c/FzgmySZJdqA3o3v+JP0vB3ZNsrh7f8wE/W4H2j9wcg1wQLd99JjzHwuQZB9g3679PHpLOh7ZfbZlkkf1cT2SJEma4yZbYvEj4Df01gaf2rTfDlw87h4b7ivAwfS+9FfAW6vq35PsOV7nqro7yWuBbyS5kYnD9JnAx5O8nl4gfh/wpSQvAb7b9Pso8KluacXqdcerqhuSHAeckWSzru87gJ9N/VIlSZI0F6Rqbi2dTbJ1Vd3RPdXiw8CVVfX+Yde19aKt67Fveeywy5AkSbNs+Ukz9ewCzbYkK6vqv/wNjH4e8/aEJBckuSPJH5Lcm+S2wZTZl1d2X9q7FNiW3lMtJEmSpBnRz1MsTgNeAHwZWAK8FHjkIIuaTDdbPPQZY0mSJM1P/QRkqurnSTbpHsf2qSQ/GnBdkiRJ0lD0E5DvSrIpsDrJ39P74t5Wgy1LkiRJGo5+noP8kq7ficCdwCOA5w2yKEmSJGlY1juDXFXXJtkCeFhVvWsWapIkSZKGpp+nWBxB7xnB3+je75dk2aALkyRJkoahnyUW7wQeD9wCUFWrgcWDK0mSJEkann4C8tqqunXglUiSJEkjoJ+nWFyS5EXAJkl2B15P789QS5IkSfPOhDPIST7Xbf4C2Bu4BzgDuA14w+BLkyRJkmbfZDPIByTZGTgGOBw4tflsS+D3gyxMkiRJGobJAvLp9J5csSuwomkPUF27JEmSNK9MuMSiqj5YVY8GPllVuzavXarKcCxJkqR5ab1Psaiq18xGIZIkSdIo6Ocxb5IkSdJGw4AsSZIkNfp5DrL6sOeD92T5ScuHXYYkSZKmyRlkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpsWDYBcwXt19xBeccetiwyxiIw849Z9glSJIkzRpnkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkC
VJkqTGrAXkJIuTXDLb+0qSJEkbYk7PICdZMOwaJEmSNL/MdkBekOQzSS5OclaSLZMckOScJCuTnJ3kYQBd+0VJfgy8bt0BkhyX5MtJvgp8Mz2nJLkkyZokx3T9Jmr/s+58X0rysyTvTXJskvO7frt1/Z7f7XtRknNneZwkSZI0JLM9A7sHcHxVLU/ySXrB97nAkVV1Qxdi3wO8AvgUcFJVnZPklDHHORjYt6puTvI8YD/gscBC4IIu0D5xgna6tkcDNwNXAZ+oqscn+UvgJOANwMnA06vq35JsN97FJDkBOAHgIZttNu3BkSRJ0vDN9gzyr6pqebf9eeDpwD7At5KsBt4B7JRkW2C7qjqn6/u5Mcf5VlXd3G0fApxRVfdW1fXAOcCBk7QDXFBVv6mqe4BfAN/s2tcAi7vt5cCnk7wS2GS8i6mqpVW1pKqWbHv/+2/4aEiSJGnkzPYMco15fztwaVUd3DZ2M7Zj+7bubLtP0GeidoB7mu37mvf30Y1JVb06yUHAs4HVSfarqpsmOaYkSZLmgdmeQV6UZF0YfiFwHrDDurYk90+yd1XdAtya5JCu77GTHPNc4JgkmyTZATgUOH+S9r4k2a2qflJVJwM3Ao/YgOuUJEnSHDXbM8iXAS9L8jHgSuBDwNnAB7tlFQuAfwAuBV4OfDLJXV2fiXyF3prki+jNOr+1qv49yUTte/ZZ6ylJdqc3E/2d7jiSJEma51I12UoG9WuPbbappfs/bthlDMRh556z/k6SJElzTJKVVbVkbPucfg6yJEmSNNMMyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUsOALEmSJDUMyJIkSVLDgCxJkiQ1DMiSJElSw4AsSZIkNQzIkiRJUmPBsAuYL7bZYw8OO/ecYZchSZKkaXIGWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhn9qeob89te3ctqbvjqr5zzx1CNm9XySJEkbA2eQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJahiQJUmSpMbAAnKSxUku2YD+xyV5ePP+miQLB1OdJEmSNL5RmkE+Dnj4+jq1kiwYTCmSJEnaWA06IC9I8pkkFyc5K8mWSU5OckGSS5IsTc/RwBLgC0lWJ9mi2/+kJKuSrEmyJ0CSd3b7fRP4bJLNk3yq63NhksO7fhO1H5fkX5J8NcnVSU5M8sauz3lJHtj1e32Sn3a1nzngcZIkSdKIGHRA3gNYWlX7ArcBrwVOq6oDq2ofYAvgv1fVWcAK4Niq2q+q7u72v7GqHgd8FHhzc9wDgCOr6kXA6wCq6jHAC4HPJNl8knaAfYAXAY8H3gPcVVX7Az8GXtr1eRuwf1f7q2d0VCRJkjSyBh2Qf1VVy7vtzwOHAIcn+UmSNcBTgL0n2f+fu58rgcVN+7ImRB8CfA6gqi4HrgUeNUk7wPeq6vaqugG4Ffhq176mOc/F9Ga0XwysHa+4JCckWZFkxR133TrJZUiSJGmuGHRArnHefwQ4upvZ/Tiw+X/Z64/u6X7eC7Trje9stjPBvhO1t8cFuK95f19znmcDH6Y3W71yvPXOVbW0qpZU1ZKtt9x2ktNJkiRprhh0QF6U5OBu+4XAD7vtG5NsDRzd9L0d2GYK5zgXOBYgyaOARcAVk7SvV5L7AY+oqu8BbwW2A7aeQm2SJEmaYwb9FIjLgJcl+RhwJb21xNvTW8pwDXBB0/fTwOlJ7gYOpn8f6fZbQ28pxHFVdU+Sidr7OeYmwOeTbEtvJvr9VXXLBtQkSZ
KkOSpVY1dBaCoWPXT3euux/3dWz3niqUfM6vkkSZLmkyQrq2rJ2PZReg6yJEmSNHQGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqWFAliRJkhoGZEmSJKlhQJYkSZIaBmRJkiSpYUCWJEmSGgZkSZIkqbFg2AXMFw/eaVtOPPWIYZchSZKkaXIGWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkiRJaqSqhl3DvJDkduCKYdcxzywEbhx2EfOQ4zoYjuvMc0wHw3GdeY7pYMzGuO5cVTuMbfQxbzPniqpaMuwi5pMkKxzTmee4DobjOvMc08FwXGeeYzoYwxxXl1hIkiRJDQOyJEmS1DAgz5ylwy5gHnJMB8NxHQzHdeY5poPhuM48x3QwhjaufklPkiRJajiDLEmSJDUMyJIkSVLDgLweSZ6R5IokP0/ytnE+3yzJF7vPf5JkcfPZX3ftVyR5+mzWPeqmOq5JFie5O8nq7nX6bNc+yvoY10OTrEqyNsnRYz57WZIru9fLZq/q0TbNMb23uVeXzV7Vo6+PcX1jkp8muTjJd5Ls3HzmvTqOaY6p9+oE+hjXVydZ043dD5Ps1XxmDpjAVMd11nJAVfma4AVsAvwC2BXYFLgI2GtMn9cCp3fbLwC+2G3v1fXfDNilO84mw76mUXhNc1wXA5cM+xpG8dXnuC4G9gU+CxzdtD8QuKr7uX23vf2wr2nYr+mMaffZHcO+hlF89TmuhwNbdtuvaf4N8F6d4THt3nuvTn1cH9BsPwf4RrdtDhjMuM5KDnAGeXKPB35eVVdV1R+AM4Ejx/Q5EvhMt30W8OdJ0rWfWVX3VNXVwM+742l646qJrXdcq+qaqroYuG/Mvk8HvlVVN1fV74BvAc+YjaJH3HTGVBPrZ1y/V1V3dW/PA3bqtr1XxzedMdXE+hnX25q3WwHrnn5gDpjYdMZ1VhiQJ7cj8Kvm/a+7tnH7VNVa4FbgQX3uu7GazrgC7JLkwiTnJHnyoIudQ6Zzz3m/jm+647J5khVJzkty1MyWNqdt6LgeD/zrFPfdWExnTMF7dSJ9jWuS1yX5BfD3wOs3ZN+N1HTGFWYhB/inpic33ozl2N9gJurTz74bq+mM62+ARVV1U5IDgH9JsveY3zQ3VtO557xfxzfdcVlUVdcl2RX4bpI1VfWLGaptLut7XJO8GFgCHLah+25kpjOm4L06kb7Gtao+DHw4yYuAdwAv63ffjdR0xnVWcoAzyJP7NfCI5v1OwHUT9UmyANgWuLnPfTdWUx7X7n9V3QRQVSvprWF61MArnhumc895v45vWuNSVdd1P68Cvg/sP5PFzWF9jWuSpwJvB55TVfdsyL4boemMqffqxDb0fjsTWDcD7706sSmP62zlAAPy5C4Adk+yS5JN6X1ZbOy3e5fR+40G4Gjgu9VbRb4MeEF6T2PYBdgdOH+W6h51Ux7XJDsk2QSgm+nYnd6XdNTfuE7kbOBpSbZPsj3wtK5tYzflMe3GcrNueyHwJOCnA6t0blnvuCbZH/gYvSD32+Yj79XxTXlMvVcn1c+47t68fTZwZbdtDpjYlMd11nLAsL/JOOov4FnAz+j9hvL2ru3d9P6BAdgc+DK9xffnA7s2+7692+8K4JnDvpZRek11XIHnAZfS+8brKuCIYV/LKL36GNcD6f3mfidwE3Bps+8ruvH+OfDyYV/LqLymOqbAE4E13b26Bjh+2NcySq8+xvXbwPXA6u61rNnXe3UGx9R7ddrj+oHuv0urge8Bezf7mgNmeFxnKwf4p6YlSZKkhkssJEmSpIYBWZIkSWoYkCVJkqSGAVmSJElqGJAlSZKkhgFZkjZiSV6f5LIkXxh2LZI0KnzMmyRtxJJcTu/5rFc3bQuqau0Qy5KkoXIGWZI2UklOB3YFliW5NcnSJN8EPptkkySnJLkgycVJXt
XtkySnJflpkq8n+X9Jjh7qhUjSDFsw7AIkScNRVa9O8gzgcOBE4AjgkKq6O8kJwK1VdWD3Z4iXd+F5f2AP4DHAQ+j9SeJPDucKJGkwDMiSpHWWVdXd3fbTgH2b2eFtgd2BQ4Ezqupe4Lok3x1CnZI0UAZkSdI6dzbbAU6qqrPbDkmeBfjlFUnzmmuQJUnjORt4TZL7AyR5VJKtgHOBF3RrlB9Gb3mGJM0rziBLksbzCWAxsCpJgBuAo4CvAE8B1gA/A84ZVoGSNCg+5k2SNGVJPg18rarOGnYtkjRTXGIhSZIkNZxBliRJkhrOIEuSJEkNA7IkSZLUMCBLkiRJDQOyJEmS1DAgS5IkSY3/D2zR9qPdY3rAAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "if hasattr(model, \"feature_importances_\"):\n", - " print(\"default feature_importances\", model.feature_importances_)\n", - " feature_importances(model, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permutation importances\n", - "\n", - "No need to check for default attributes or functions, this can be run on any kind of model:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.06545086119554205, 'longitude'],\n", - " [0.06160081053698076, 'latitude'],\n", - " [0.053495440729483285, 'bedrooms'],\n", - " [0.021681864235055734, 'bathrooms'],\n", - " [0.0004052684903748799, 'random']], dtype=object)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pi = permutation_importances(model, X_train, y_train, features)\n", - "pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
4random-0.042756
0bathrooms0.001824
1bedrooms0.023100
2longitude0.049240
3latitude0.051874
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "4 random -0.042756\n", - "0 bathrooms 0.001824\n", - "1 bedrooms 0.023100\n", - "2 longitude 0.049240\n", - "3 latitude 0.051874" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dc = dropcol_importances(model, X_train, y_train)\n", - "dc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## conclusions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So I would say location is a prime factor, then the number of bedrooms. Bathrooms often is gte bedrooms, and is likely correlated so one of them should likely be dropped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "toc-autonumbering": false, - "toc-showcode": false, - "toc-showmarkdowntxt": false - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/feature_perms/0.9.0/src/feature_perms.ipynb b/functions/development/feature_perms/0.9.0/src/feature_perms.ipynb deleted file mode 100644 index 77da7b55..00000000 --- a/functions/development/feature_perms/0.9.0/src/feature_perms.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# permutation_importances as reusable function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## function code" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import 
numbers\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "from sklearn.utils import check_random_state\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from cloudpickle import load\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem\n", - "from mlrun.artifacts import get_model, PlotArtifact\n", - "from typing import Union, Callable, List\n", - "\n", - "def _get_n_samples_bootstrap(n_samples, max_samples) -> int:\n", - " \"\"\"get the number of samples in a bootstrap sample\n", - " \n", - " returns the total number of samples to draw for the bootstrap sample\n", - " \n", - " private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py\n", - "\n", - " :param n_samples: Number of samples in the dataset.\n", - " :param max_samples: \n", - " The maximum number of samples to draw from the total available:\n", - " - if float, this indicates a fraction of the total and should be\n", - " the interval `(0, 1)`;\n", - " - if int, this indicates the exact number of samples;\n", - " - if None, this indicates the total number of samples.\n", - " \"\"\"\n", - " if max_samples is None:\n", - " return n_samples\n", - "\n", - " if isinstance(max_samples, numbers.Integral):\n", - " if not (1 <= max_samples <= n_samples):\n", - " msg = \"`max_samples` must be in range 1 to {} but got value {}\"\n", - " raise ValueError(msg.format(n_samples, max_samples))\n", - " return max_samples\n", - "\n", - " if isinstance(max_samples, numbers.Real):\n", - " if not (0 < max_samples < 1):\n", - " msg = \"`max_samples` must be in range (0, 1) but got value {}\"\n", - " raise ValueError(msg.format(max_samples))\n", - " return int(round(n_samples * max_samples))\n", - "\n", - " msg = \"`max_samples` should be int or float, but got type '{}'\"\n", - " raise TypeError(msg.format(type(max_samples)))\n", - "\n", - "def _get_unsampled_ix(random_state, n_samples: int) -> np.array:\n", 
- " \"\"\"\n", - " future-proof get unsampled indices\n", - " \"\"\"\n", - " n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - "\n", - " return np.arange(n_samples)[sample_counts==0]\n", - "\n", - "def _oob_classifier_accuracy(rf, X_train, y_train) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train\n", - " y = y_train.values if isinstance(y_train, pd.Series) else y_train\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score\n", - "\n", - "def permutation_importances(\n", - " context: MLClientCtx,\n", - " model: DataItem,\n", - " dataset: DataItem,\n", - " labels: str,\n", - " figsz=(10, 5),\n", - " plots_dest: str = \"plots\",\n", - " fitype: str = \"permute\"\n", - ") -> pd.DataFrame:\n", - " \"\"\"calculate change in metric\n", - " \n", - " type 'permute' uses a pre-estimated model\n", - " type 'dropcol' uses a re-estimates model\n", - " \n", - " :param context: the function's execution context\n", - " :param model: a trained model\n", - " :param dataset: 
features and ground truths, regression targets\n", - " :param labels name of the ground truths column\n", - " :param figsz: matplotlib figure size\n", - " :param plots_dest: path within artifact store\n", - " :\n", - " \"\"\"\n", - " model_file, model_data, _ = get_model(model.url, suffix='.pkl')\n", - " model = load(open(str(model_file), \"rb\"))\n", - " \n", - " X = dataset.as_df()\n", - " y = X.pop(labels)\n", - " header = X.columns\n", - " \n", - " # this will be paramettrized next version, and include regression\n", - " metric = _oob_classifier_accuracy\n", - " \n", - " baseline = metric(model, X, y)\n", - " \n", - " imp = []\n", - " for col in X.columns:\n", - " if fitype is \"permute\":\n", - " save = X[col].copy()\n", - " X[col] = np.random.permutation(X[col])\n", - " m = metric(model, X, y)\n", - " X[col] = save\n", - " imp.append(baseline - m)\n", - " elif fitype is \"dropcol\":\n", - " X_ = X.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_, y)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " else:\n", - " raise ValueError(\"unknown fitype, only 'permute' or 'dropcol' permitted\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(f\"feature importances-{fitype}\")\n", - " plt.tight_layout()\n", - "\n", - " context.log_artifact(PlotArtifact(f\"feature importances-{fitype}\", body=plt.gcf()),\n", - " local_path=f\"{plots_dest}/feature-permutations.html\")\n", - " context.log_dataset(f\"feature-importances-{fitype}-tbl\", df=feature_imp, index=False)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## save function" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.platforms.other import auto_mount\n", - "\n", - "gpus = False\n", - "\n", - "# create job function object from notebook code\n", - "fn_params = {\n", - " \"name\" : \"feature-perms\",\n", - " \"handler\" : \"permutation_importances\",\n", - " \"kind\" : \"job\",\n", - " \"image\" : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n", - " \"description\" : \"estimate feature importances using permutations\",\n", - " \"categories\" : [\"analysis\"],\n", - " \"labels\" : {\"author\": \"yjb\"}\n", - "}\n", - "\n", - "perms_fn = code_to_function(**fn_params)\n", - "perms_fn.apply(auto_mount())\n", - "perms_fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "from mlrun import NewTask, mlconf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### get some data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:25,486 Job is running in 
the background, pod: tasks-arc-to-parq-xqkr7\n", - "[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet\n", - "[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp\n", - "[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading\n", - "[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default\n", - "[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed\n" - ] - } - ], - "source": [ - "data_url = \"https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv\"\n", - "\n", - "fn = import_function(\"hub://arc_to_parquet\", \"a2p\")\n", - "fn.apply(auto_mount())\n", - "\n", - "params = {\n", - " \"name\" : \"tasks arc-to-parq\",\n", - " \"params\" : {\"key\":\"rent\", \"stats\": True, \"file_ext\":\"csv\"}\n", - "}\n", - "acquire_run = fn.run(NewTask(**params),inputs={\"archive_url\" : data_url},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### train a model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5\n", - "[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model\n", - "[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y\n", - "[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N\n", - "[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N\n", - "[mlrun] 2020-06-07 19:58:37,806 log artifact 
precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N\n", - "[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N\n", - "[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default\n", - "[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed\n" - ] - } - ], - "source": [ - "fn = import_function(\"hub://sklearn_classifier\", \"skrf\")\n", - "fn.apply(auto_mount())\n", - "\n", - "# define model\n", - "params = {\n", - " \"name\" : \"tasks random forest\",\n", - " \"params\" : {\n", - " \"sample\" : -5_000, # 5k random rows,\n", - " \"model_pkg_class\" : \"sklearn.ensemble.RandomForestClassifier\",\n", - " \"label_column\" : \"interest_level\",\n", - " \"CLASS_n_estimators\" : 100,\n", - " \"CLASS_min_samples_leaf\" : 1,\n", - " \"CLASS_n_jobs\" : -1,\n", - " \"CLASS_oob_score\" : True}\n", - "}\n", - "\n", - "train_run = fn.run(NewTask(**params), inputs={\"dataset\" : acquire_run.outputs[\"rent\"]},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Importances

\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=train_run.outputs['feature-importances'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "data = acquire_run.outputs[\"rent\"]\n", - "labels = \"interest_level\"\n", - "model = train_run.outputs[\"model\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt\n", - "[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances\n", - "[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y\n", - "[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default\n", - "[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed\n" - ] - } - ], - "source": [ - "fi_perms = perms_fn.run(\n", - " NewTask(params={\"labels\": labels, \n", - " \"plots_dest\": \"plots\"}),\n", - " inputs={\"model\": model, \"dataset\": data},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=fi_perms.outputs['feature importances-permute'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/feature_perms/0.9.0/src/feature_perms.py b/functions/development/feature_perms/0.9.0/src/feature_perms.py deleted file mode 100644 index 3a6c9948..00000000 --- a/functions/development/feature_perms/0.9.0/src/feature_perms.py +++ /dev/null @@ -1,160 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import numpy as np -import pandas as pd -import numbers - 
-import sklearn -from sklearn.base import clone -from sklearn.utils import check_random_state - -import matplotlib.pyplot as plt -import seaborn as sns - -from cloudpickle import load - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem -from mlrun.artifacts import get_model, PlotArtifact -from typing import Union, Callable, List - - -def _get_n_samples_bootstrap(n_samples, max_samples) -> int: - """get the number of samples in a bootstrap sample - - returns the total number of samples to draw for the bootstrap sample - - private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py - - :param n_samples: Number of samples in the dataset. - :param max_samples: - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. - """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _get_unsampled_ix(random_state, n_samples: int) -> np.array: - """ - future-proof get unsampled indices - """ - n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples) - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_bootstrap) - sample_counts = np.bincount(sample_indices, minlength=n_samples) - - return 
np.arange(n_samples)[sample_counts == 0] - - -def _oob_classifier_accuracy(rf, X_train, y_train) -> float: - """ - Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier. - - https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425 - """ - X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train - y = y_train.values if isinstance(y_train, pd.Series) else y_train - - n_samples = len(X) - n_classes = len(np.unique(y)) - predictions = np.zeros((n_samples, n_classes)) - for tree in rf.estimators_: - unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples) - tree_preds = tree.predict_proba(X[unsampled_indices, :]) - predictions[unsampled_indices] += tree_preds - - predicted_class_indexes = np.argmax(predictions, axis=1) - predicted_classes = [rf.classes_[i] for i in predicted_class_indexes] - - oob_score = np.mean(y == predicted_classes) - - return oob_score - - -def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline 
- m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - ) diff --git a/functions/development/feature_perms/0.9.0/src/function.yaml b/functions/development/feature_perms/0.9.0/src/function.yaml deleted file mode 100644 index 713981fd..00000000 --- a/functions/development/feature_perms/0.9.0/src/function.yaml +++ /dev/null @@ -1,63 +0,0 @@ -kind: job -metadata: - name: feature-perms - tag: '' - hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7 - project: '' - labels: - author: yjb - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: permutation_importance - entry_points: - permutation_importance: - name: permutation_importance - doc: 'calculate change in metric - - - type ''permute'' uses a pre-estimated model - - type ''dropcol'' uses a re-estimates model' - parameters: - - name: context - type: MLClientCtx - doc: the function's execution context - default: '' - - name: model - type: DataItem - doc: a trained model - default: '' - - name: dataset - type: DataItem - doc: features and ground truths, regression targets - default: '' - - name: labels - type: str - default: '' - - name: figsz - doc: 
matplotlib figure size - default: - - 10 - - 5 - - name: plots_dest - type: str - doc: path within artifact store - default: plots - - name: fitype - type: str - default: permute - outputs: - - default: '' - lineno: 93 - description: estimate feature importances using permutations - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0
gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3N
lcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA
9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py - affinity: null -verbose: false diff --git a/functions/development/feature_perms/0.9.0/src/item.yaml b/functions/development/feature_perms/0.9.0/src/item.yaml deleted file mode 100644 index ec93f7ad..00000000 --- a/functions/development/feature_perms/0.9.0/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: 
-- data-analysis -description: estimate feature importances using permutations -doc: '' -example: feature_perms.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: yjb -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: feature-perms -platformVersion: 3.2.0 -spec: - filename: feature_perms.py - handler: permutation_importance - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/feature_perms/0.9.0/src/requirements.txt b/functions/development/feature_perms/0.9.0/src/requirements.txt deleted file mode 100644 index f53a3289..00000000 --- a/functions/development/feature_perms/0.9.0/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -mlrun -sklearn -matplotlib -seaborn -scikit-plot - diff --git a/functions/development/feature_perms/0.9.0/src/test_feature_perms.py b/functions/development/feature_perms/0.9.0/src/test_feature_perms.py deleted file mode 100644 index b270292d..00000000 --- a/functions/development/feature_perms/0.9.0/src/test_feature_perms.py +++ /dev/null @@ -1,98 +0,0 @@ -from mlrun import code_to_function, import_function -from pathlib import Path -import os - -ARTIFACTS_PATH = 'artifacts' -DATA_URL = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv" -FEATURE_OUTPUT = ARTIFACTS_PATH + "/feature-importances-permute-tbl.parquet" - - -def arc_to_parquet(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - archive_func = import_function('hub://arc_to_parquet') - archive_run = archive_func.run(handler="arc_to_parquet", - params={"key": "rent", "stats": True, "file_ext": "csv"}, - inputs={"archive_url": DATA_URL}, - artifact_path=os.getcwd() + '/artifacts' - , local=True - ) - - -def sklearn_classifier(run): - cwd = os.getcwd() - file_path = str(Path(cwd).parent.absolute()) + "/sklearn_classifier/sklearn_classifier.py" - fn = code_to_function(name='test_sklearn_classifier', - 
filename=file_path, - handler="train_model", - kind="local", - ) - fn.spec.command = file_path - fn.run(params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - handler="train_model", - inputs={"dataset": run.outputs["rent"]}, - artifact_path='artifacts' - # , local=True - ) - - -def train_model(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - train = import_function('hub://sklearn_classifier') - # .apply(auto_mount()) - - train_run = train.run( - inputs={"dataset": "artifacts/rent.csv"}, - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - local=True) - - -def test_feature_selection_run_local(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fn = code_to_function(name='test_run_local_feature_perms', - filename="feature_perms.py", - handler="permutation_importance", - kind="local", - ) - fn.spec.command = "feature_perms.py" - fi_perms = fn.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path='artifacts') - assert Path(FEATURE_OUTPUT).is_file() - - -def test_feature_perms_import_function(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fi_perms = import_function("function.yaml") - fi_perms.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path=os.getcwd() + '/artifacts' - , local=True) - assert Path(FEATURE_OUTPUT).is_file() diff --git 
a/functions/development/feature_perms/0.9.0/static/documentation.html b/functions/development/feature_perms/0.9.0/static/documentation.html deleted file mode 100644 index ed89d14b..00000000 --- a/functions/development/feature_perms/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -feature_perms package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

feature_perms package

-
-

Submodules

-
-
-

feature_perms.feature_perms module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/0.9.0/static/example.html b/functions/development/feature_perms/0.9.0/static/example.html deleted file mode 100644 index 955c97e1..00000000 --- a/functions/development/feature_perms/0.9.0/static/example.html +++ /dev/null @@ -1,1065 +0,0 @@ - - - - - - - -permutation_importances as reusable function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

permutation_importances as reusable function

-
-

function code

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-    
-    returns the total number of samples to draw for the bootstrap sample
-    
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples: 
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts==0]
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-    
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-    
-    return oob_score
-
-def permutation_importances(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute"
-) -> pd.DataFrame:
-    """calculate change in metric
-    
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-    
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix='.pkl')
-    model = load(open(str(model_file), "rb"))
-    
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-    
-    # this will be paramettrized next version, and include regression
-    metric = _oob_classifier_accuracy
-    
-    baseline = metric(model, X, y)
-    
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    # create a feature importance table with desired labels
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-                         local_path=f"{plots_dest}/feature-permutations.html")
-    context.log_dataset(f"feature-importances-{fitype}-tbl", df=feature_imp, index=False)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

save function

-
-
-
from mlrun import code_to_function
-from mlrun.platforms.other import auto_mount
-
-gpus = False
-
-# create job function object from notebook code
-fn_params = {
-    "name"        : "feature-perms",
-    "handler"     : "permutation_importances",
-    "kind"        : "job",
-    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
-    "description" : "estimate feature importances using permutations",
-    "categories"  : ["analysis"],
-    "labels"      : {"author": "yjb"}
-}
-
-perms_fn = code_to_function(**fn_params)
-perms_fn.apply(auto_mount())
-perms_fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7feb0104efd0>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-from mlrun import NewTask, mlconf
-
-
-
-
-
-

get some data

-
-
-
data_url = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv"
-
-fn = import_function("hub://arc_to_parquet", "a2p")
-fn.apply(auto_mount())
-
-params = {
-    "name" : "tasks arc-to-parq",
-    "params" : {"key":"rent", "stats": True, "file_ext":"csv"}
-}
-acquire_run = fn.run(NewTask(**params),inputs={"archive_url" : data_url},
-                     artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:25,486 Job is running in the background, pod: tasks-arc-to-parq-xqkr7
-[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet
-[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp
-[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading
-[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y
-
-[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default
-[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed
-
-
-
-
-
-
-

train a model

-
-
-
fn = import_function("hub://sklearn_classifier", "skrf")
-fn.apply(auto_mount())
-
-# define model
-params = {
-    "name" : "tasks random forest",
-    "params" : {
-        "sample"                 : -5_000, # 5k random rows,
-        "model_pkg_class"        : "sklearn.ensemble.RandomForestClassifier",
-        "label_column"           : "interest_level",
-        "CLASS_n_estimators"     : 100,
-        "CLASS_min_samples_leaf" : 1,
-        "CLASS_n_jobs"           : -1,
-        "CLASS_oob_score"        : True}
-}
-
-train_run = fn.run(NewTask(**params), inputs={"dataset" : acquire_run.outputs["rent"]},
-                   artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5
-[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model
-[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y
-[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N
-[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N
-[mlrun] 2020-06-07 19:58:37,806 log artifact precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N
-[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N
-[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y
-
-[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default
-[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=train_run.outputs['feature-importances'])
-
-
-
-
-

Feature Importances

-
-
-
-
-
data   = acquire_run.outputs["rent"]
-labels = "interest_level"
-model  = train_run.outputs["model"]
-
-
-
-
-
-
-
fi_perms = perms_fn.run(
-    NewTask(params={"labels": labels, 
-                    "plots_dest": "plots"}),
-    inputs={"model": model, "dataset": data},
-    artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt
-[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances
-[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y
-[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y
-
-[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default
-[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=fi_perms.outputs['feature importances-permute'])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/0.9.0/static/function.html b/functions/development/feature_perms/0.9.0/static/function.html deleted file mode 100644 index f9520633..00000000 --- a/functions/development/feature_perms/0.9.0/static/function.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: feature-perms
-  tag: ''
-  hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7
-  project: ''
-  labels:
-    author: yjb
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: permutation_importance
-  entry_points:
-    permutation_importance:
-      name: permutation_importance
-      doc: 'calculate change in metric
-
-
-        type ''permute'' uses a pre-estimated model
-
-        type ''dropcol'' uses a re-estimates model'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function's execution context
-        default: ''
-      - name: model
-        type: DataItem
-        doc: a trained model
-        default: ''
-      - name: dataset
-        type: DataItem
-        doc: features and ground truths, regression targets
-        default: ''
-      - name: labels
-        type: str
-        default: ''
-      - name: figsz
-        doc: matplotlib figure size
-        default:
-        - 10
-        - 5
-      - name: plots_dest
-        type: str
-        doc: path within artifact store
-        default: plots
-      - name: fitype
-        type: str
-        default: permute
-      outputs:
-      - default: ''
-      lineno: 93
-  description: estimate feature importances using permutations
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1
iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10
gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9
kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.9.0/static/item.html b/functions/development/feature_perms/0.9.0/static/item.html deleted file mode 100644 index 715fda93..00000000 --- a/functions/development/feature_perms/0.9.0/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: estimate feature importances using permutations
-doc: ''
-example: feature_perms.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: yjb
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: feature-perms
-platformVersion: 3.2.0
-spec:
-  filename: feature_perms.py
-  handler: permutation_importance
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/0.9.0/static/source.html b/functions/development/feature_perms/0.9.0/static/source.html deleted file mode 100644 index 3013427a..00000000 --- a/functions/development/feature_perms/0.9.0/static/source.html +++ /dev/null @@ -1,182 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-def permutation_importance(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute",
-) -> pd.DataFrame:
-    """calculate change in metric
-
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
-    model = load(open(str(model_file), "rb"))
-
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-
-    metric = _oob_classifier_accuracy
-
-    baseline = metric(model, X, y)
-
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            #model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(
-        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-        local_path=f"{plots_dest}/feature-permutations.html",
-    )
-    context.log_dataset(
-        f"feature-importances-{fitype}-tbl", df=feature_imp, index=False
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/1.0.0/src/README.ipynb b/functions/development/feature_perms/1.0.0/src/README.ipynb deleted file mode 100644 index 0929a6f6..00000000 --- a/functions/development/feature_perms/1.0.0/src/README.ipynb +++ /dev/null @@ -1,788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# feature importances" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are a number of ways to compute feature importances and **the default estimates reported by scikit learn can be shown to be biased** under certain circumstances. In addition, many non-tree algorithms do not provide conveniently calculated feature importance estimates. The following demonstration is based on material that draws heavily from the following sources:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## references\n", - "\n", - "\n", - "### repos\n", - "\n", - "* **[Feature importances for scikit-learn machine learning models](https://github.com/parrt/random-forest-importances)**, [MIT License](https://github.com/parrt/random-forest-importances/blob/master/LICENSE)\n", - "* **[Scikit-Learn ensemble module - forests](https://github.com/scikit-learn/scikit-learn/blob/0.23.1/sklearn/ensemble/_forest.py)**, [BSD License](https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/ensemble/_forest.py#L40)\n", - "* **[ELI5 - Permutation Importance](https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html)** \n", - "\n", - "### articles\n", - "\n", - "Strobl, C., Boulesteix, A., Zeileis, A. et al. **[Bias in random forest variable importance measures: Illustrations, sources and a solution](https://link.springer.com/article/10.1186/1471-2105-8-25#citeas)**. BMC Bioinformatics 8, 25 (2007). https://doi.org/10.1186/1471-2105-8-25 \n", - "\n", - "Strobl, C., Boulesteix, A., Kneib, T. et al. 
**[Conditional variable importance for random forests](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307#citeas)**. BMC Bioinformatics 9, 307 (2008). https://doi.org/10.1186/1471-2105-9-307 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## what we'll do\n", - "\n", - "* demonstrate an issue with default feature importance estimates \n", - "* provide alternatives and compare to the default \n", - "* create a new function `feature_perms` that implements a computationally simple algorithm \n", - "* create a new function `dropcol_importances` that implements a computationally intensive algorithm that is more accurate\n", - "* test our new functions\n", - "\n", - "It should be noted that although we are developing this notebook using a classification example, an almost identical presentation can be done for regression." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier as SomeModel\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from typing import Union, Callable, List" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## default feature importances\n", - "\n", - "This is a function that plots default feature importances from an estimated model object when available. 
It is taken from mlrun's current source-code implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def feature_importances(\n", - " model: SomeModel,\n", - " header: List[str], \n", - " figsz=(10, 5)\n", - ") -> None:\n", - " \"\"\"Display default model feature importances\n", - "\n", - " Only works for models with attribute 'feature_importances_`\n", - "\n", - " :param model: fitted model with a feature_importances_ attribute\n", - " :param header: feature labels\n", - " :param figsz: matplotlib figure size\n", - " \"\"\"\n", - " if not hasattr(model, \"feature_importances_\"):\n", - " raise Exception(\n", - " \"feature importances are only available for some models\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(model.feature_importances_, header)\n", - " feature_imp = pd.DataFrame(\n", - " sorted(zipped), columns=[\"freq\", \"feature\"]).sort_values(\n", - " by=\"freq\", ascending=False)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"freq\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"features\")\n", - " plt.tight_layout();\n", - " \n", - " return feature_imp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permuted features\n", - "\n", - "A proposed solution that has general applicability is randomly permuted features**[refs](#references)**: \n", - "* loop through the feature set \n", - "* shuffle one feature \n", - "* run predict\n", - "* compare the (marginal) change in accuracy (or other metric of interest) \n", - "\n", - "This approach is computationally more demanding than relying on the default values, however it can be easily parallelized. To perform the estimation we only need an estimated model and a held-out test set. 
The following was proposed in **[Beware Default Random Forest Importances](https://explained.ai/rf-importance/index.html)**:\n", - "\n", - "( the following 3 glue functions will no longer be publicly visible in the sklearn package from 0.24 onwards, consider this a temporary hack while we refactor these away)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**the following has been refactored in final version of function:**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from distutils.version import LooseVersion\n", - "import numpy as np\n", - "from sklearn.utils import check_random_state\n", - "\n", - "def _generate_sample_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to _parallel_build_trees function.\n", - " taken from:\n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L116\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n", - "\n", - " return sample_indices\n", - "\n", - "def _generate_unsampled_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to forest._set_oob_score function.\n", - " taken from: \n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L126\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " sample_indices = _generate_sample_indices(random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - " unsampled_mask = sample_counts == 0\n", - " indices_range = np.arange(n_samples)\n", - " unsampled_indices 
= indices_range[unsampled_mask]\n", - "\n", - " return unsampled_indices\n", - "\n", - "def _get_unsampled_indices(tree, n_samples: int):\n", - " \"\"\"\n", - " An interface to get unsampled indices regardless of sklearn version.\n", - " \"\"\"\n", - " import warnings\n", - " warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", - " if LooseVersion(sklearn.__version__) >= LooseVersion(\"0.22\"):\n", - " # Version 0.22 or newer uses 3 arguments.\n", - " from sklearn.ensemble.forest import _get_n_samples_bootstrap\n", - " n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " return _generate_unsampled_indices(tree.random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " else:\n", - " # Version 0.21 or older uses only two arguments.\n", - " return _generate_unsampled_indices(tree.random_state, n_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following function estimates classifier accuracy and has been borrowed from **[references](#references)**. 
See **[breitman on oob](https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf)** for details on out-of-bag estimation:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def oob_classifier_accuracy(rf, X_train: np.array, y_train: np.array) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values\n", - " y = y_train.values\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_indices(tree, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Putting it all together:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def permutation_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array, \n", - " header: List[str],\n", - " metric: Callable = oob_classifier_accuracy,\n", - " figsz=(10, 5)\n", - ") -> np.array:\n", - " \"\"\"calculate change in metric from permuting feature columns\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " uses a pre-estimated model\n", - "\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truths, regression targets\n", - " :param header: column labels for X_train\n", - " 
:param figsz: matplotlib figure size\n", - " \n", - " \"\"\"\n", - " baseline = metric(model, X_train, y_train)\n", - " imp = []\n", - " for col in X_train.columns:\n", - " save = X_train[col].copy()\n", - " X_train[col] = np.random.permutation(X_train[col])\n", - " m = metric(model, X_train, y_train)\n", - " X_train[col] = save\n", - " imp.append(baseline - m)\n", - " \n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"feature permutation importances\")\n", - " plt.tight_layout()\n", - "\n", - " return np.array(feature_imp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances\n", - "\n", - "According to our **[references](#references)** a more accurate measure of feature importance would have us re-estimate the model after dropping a column. This is considered as being close to \"ideal\". 
Unfortunately, the entire model needs to be re-estimated for each column and without some approximating shortcut this is likely to be infeasible for large datasets.\n", - "\n", - "Here is the suggested implementation and **don't run this on big models!**:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def dropcol_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array,\n", - " header: List[str] = [],\n", - " random_state: int = 1994,\n", - " figsz=(10, 5)\n", - ") -> pd.DataFrame:\n", - " \"\"\"drop columns and re-estimate model\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " :param rf: model to fit\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truth labels\n", - "\n", - " Returns:\n", - " pd.DataFrame: table of diffs vs baseline metric\n", - " \"\"\"\n", - " # cloning makes copy of model pre-fit\n", - " # calculate a baseline with all features\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_train, y_train)\n", - " baseline = model_.oob_score_\n", - " \n", - " # now drop each colum, refit model and calc metric\n", - " imp = []\n", - " for col in X_train.columns:\n", - " X = X_train.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X, y_train)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " \n", - " # put it all in a table\n", - " imp = np.array(imp)\n", - " feature_imps = pd.DataFrame(\n", - " data={'feature': X_train.columns,\n", - " 'importance': imp})\n", - " #feature_imps.set_index('feature', inplace=True)\n", - " feature_imps.sort_values('importance', ascending=True, inplace=True)\n", - " \n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imps)\n", - " plt.title(\"drop column feature 
importances\")\n", - " plt.tight_layout()\n", - " \n", - " return feature_imps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## demonstration\n", - "\n", - "In this demonstratuon we are going to take a fraction of a fraction of **[Kaggle's RentHop rental listing interest competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)**--the complete dataset is presently >80GB, we'll be looking at 5K rows. \n", - "\n", - "The competition's **[goal](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)** was\n", - "> to predict the number of inquiries a new listing receives based on the listing’s creation date and other features. \n", - "\n", - "Doing so would help **[RentHop](https://www.renthop.com/)**\n", - "> better handle fraud control, identify potential listing quality issues, and allow owners and agents to better understand renters’ needs and preferences." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = \"/User/artifacts/two-sigma-connect-rental-listing-inquiries/\"\n", - "NFRAC = 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample dimensions (4935, 6)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bathroomsbedroomspricelongitudelatitudeinterest_level
182351.001800-73.996040.71972
421401.023000-73.987940.76533
46771.011350-73.899640.85492
\n", - "
" - ], - "text/plain": [ - " bathrooms bedrooms price longitude latitude interest_level\n", - "18235 1.0 0 1800 -73.9960 40.7197 2\n", - "42140 1.0 2 3000 -73.9879 40.7653 3\n", - "4677 1.0 1 1350 -73.8996 40.8549 2" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(data + 'rent.csv').sample(frac=NFRAC)\n", - "print(\"sample dimensions\", df.shape)\n", - "df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']\n", - "dfr = df[features]\n", - "\n", - "# drop price column\n", - "X_train, y_train = dfr.drop('price', axis=1), dfr['price']\n", - "\n", - "# insert column with random values\n", - "X_train['random'] = np.random.random(size=len(X_train))\n", - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'random']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_jobs=-1, oob_score=True)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# define model\n", - "model_params = {\n", - " \"n_estimators\" : 100, \n", - " \"min_samples_leaf\" : 1,\n", - " \"n_jobs\" : -1,\n", - " \"oob_score\" : True\n", - "}\n", - "\n", - "model = SomeModel(**model_params)\n", - "\n", - "# estimate\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### to run this the model needs a default attribute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "default feature_importances [0.01683784 0.03215169 0.29983429 0.30418813 0.34698806]\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "if hasattr(model, \"feature_importances_\"):\n", - " print(\"default feature_importances\", model.feature_importances_)\n", - " feature_importances(model, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permutation importances\n", - "\n", - "No need to check for default attributes or functions, this can be run on any kind of model:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.06545086119554205, 'longitude'],\n", - " [0.06160081053698076, 'latitude'],\n", - " [0.053495440729483285, 'bedrooms'],\n", - " [0.021681864235055734, 'bathrooms'],\n", - " [0.0004052684903748799, 'random']], dtype=object)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pi = permutation_importances(model, X_train, y_train, features)\n", - "pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
4random-0.042756
0bathrooms0.001824
1bedrooms0.023100
2longitude0.049240
3latitude0.051874
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "4 random -0.042756\n", - "0 bathrooms 0.001824\n", - "1 bedrooms 0.023100\n", - "2 longitude 0.049240\n", - "3 latitude 0.051874" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dc = dropcol_importances(model, X_train, y_train)\n", - "dc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## conclusions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So I would say location is a prime factor, then the number of bedrooms. Bathrooms often is gte bedrooms, and is likely correlated so one of them should likely be dropped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "toc-autonumbering": false, - "toc-showcode": false, - "toc-showmarkdowntxt": false - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/feature_perms/1.0.0/src/feature_perms.ipynb b/functions/development/feature_perms/1.0.0/src/feature_perms.ipynb deleted file mode 100644 index 77da7b55..00000000 --- a/functions/development/feature_perms/1.0.0/src/feature_perms.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# permutation_importances as reusable function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## function code" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import 
numbers\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "from sklearn.utils import check_random_state\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from cloudpickle import load\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem\n", - "from mlrun.artifacts import get_model, PlotArtifact\n", - "from typing import Union, Callable, List\n", - "\n", - "def _get_n_samples_bootstrap(n_samples, max_samples) -> int:\n", - " \"\"\"get the number of samples in a bootstrap sample\n", - " \n", - " returns the total number of samples to draw for the bootstrap sample\n", - " \n", - " private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py\n", - "\n", - " :param n_samples: Number of samples in the dataset.\n", - " :param max_samples: \n", - " The maximum number of samples to draw from the total available:\n", - " - if float, this indicates a fraction of the total and should be\n", - " the interval `(0, 1)`;\n", - " - if int, this indicates the exact number of samples;\n", - " - if None, this indicates the total number of samples.\n", - " \"\"\"\n", - " if max_samples is None:\n", - " return n_samples\n", - "\n", - " if isinstance(max_samples, numbers.Integral):\n", - " if not (1 <= max_samples <= n_samples):\n", - " msg = \"`max_samples` must be in range 1 to {} but got value {}\"\n", - " raise ValueError(msg.format(n_samples, max_samples))\n", - " return max_samples\n", - "\n", - " if isinstance(max_samples, numbers.Real):\n", - " if not (0 < max_samples < 1):\n", - " msg = \"`max_samples` must be in range (0, 1) but got value {}\"\n", - " raise ValueError(msg.format(max_samples))\n", - " return int(round(n_samples * max_samples))\n", - "\n", - " msg = \"`max_samples` should be int or float, but got type '{}'\"\n", - " raise TypeError(msg.format(type(max_samples)))\n", - "\n", - "def _get_unsampled_ix(random_state, n_samples: int) -> np.array:\n", 
- " \"\"\"\n", - " future-proof get unsampled indices\n", - " \"\"\"\n", - " n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - "\n", - " return np.arange(n_samples)[sample_counts==0]\n", - "\n", - "def _oob_classifier_accuracy(rf, X_train, y_train) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train\n", - " y = y_train.values if isinstance(y_train, pd.Series) else y_train\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score\n", - "\n", - "def permutation_importances(\n", - " context: MLClientCtx,\n", - " model: DataItem,\n", - " dataset: DataItem,\n", - " labels: str,\n", - " figsz=(10, 5),\n", - " plots_dest: str = \"plots\",\n", - " fitype: str = \"permute\"\n", - ") -> pd.DataFrame:\n", - " \"\"\"calculate change in metric\n", - " \n", - " type 'permute' uses a pre-estimated model\n", - " type 'dropcol' uses a re-estimates model\n", - " \n", - " :param context: the function's execution context\n", - " :param model: a trained model\n", - " :param dataset: 
features and ground truths, regression targets\n", - " :param labels name of the ground truths column\n", - " :param figsz: matplotlib figure size\n", - " :param plots_dest: path within artifact store\n", - " :\n", - " \"\"\"\n", - " model_file, model_data, _ = get_model(model.url, suffix='.pkl')\n", - " model = load(open(str(model_file), \"rb\"))\n", - " \n", - " X = dataset.as_df()\n", - " y = X.pop(labels)\n", - " header = X.columns\n", - " \n", - " # this will be paramettrized next version, and include regression\n", - " metric = _oob_classifier_accuracy\n", - " \n", - " baseline = metric(model, X, y)\n", - " \n", - " imp = []\n", - " for col in X.columns:\n", - " if fitype is \"permute\":\n", - " save = X[col].copy()\n", - " X[col] = np.random.permutation(X[col])\n", - " m = metric(model, X, y)\n", - " X[col] = save\n", - " imp.append(baseline - m)\n", - " elif fitype is \"dropcol\":\n", - " X_ = X.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_, y)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " else:\n", - " raise ValueError(\"unknown fitype, only 'permute' or 'dropcol' permitted\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(f\"feature importances-{fitype}\")\n", - " plt.tight_layout()\n", - "\n", - " context.log_artifact(PlotArtifact(f\"feature importances-{fitype}\", body=plt.gcf()),\n", - " local_path=f\"{plots_dest}/feature-permutations.html\")\n", - " context.log_dataset(f\"feature-importances-{fitype}-tbl\", df=feature_imp, index=False)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## save function" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.platforms.other import auto_mount\n", - "\n", - "gpus = False\n", - "\n", - "# create job function object from notebook code\n", - "fn_params = {\n", - " \"name\" : \"feature-perms\",\n", - " \"handler\" : \"permutation_importances\",\n", - " \"kind\" : \"job\",\n", - " \"image\" : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n", - " \"description\" : \"estimate feature importances using permutations\",\n", - " \"categories\" : [\"analysis\"],\n", - " \"labels\" : {\"author\": \"yjb\"}\n", - "}\n", - "\n", - "perms_fn = code_to_function(**fn_params)\n", - "perms_fn.apply(auto_mount())\n", - "perms_fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "from mlrun import NewTask, mlconf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### get some data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:25,486 Job is running in 
the background, pod: tasks-arc-to-parq-xqkr7\n", - "[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet\n", - "[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp\n", - "[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading\n", - "[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default\n", - "[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed\n" - ] - } - ], - "source": [ - "data_url = \"https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv\"\n", - "\n", - "fn = import_function(\"hub://arc_to_parquet\", \"a2p\")\n", - "fn.apply(auto_mount())\n", - "\n", - "params = {\n", - " \"name\" : \"tasks arc-to-parq\",\n", - " \"params\" : {\"key\":\"rent\", \"stats\": True, \"file_ext\":\"csv\"}\n", - "}\n", - "acquire_run = fn.run(NewTask(**params),inputs={\"archive_url\" : data_url},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### train a model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5\n", - "[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model\n", - "[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y\n", - "[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N\n", - "[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N\n", - "[mlrun] 2020-06-07 19:58:37,806 log artifact 
precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N\n", - "[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N\n", - "[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default\n", - "[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed\n" - ] - } - ], - "source": [ - "fn = import_function(\"hub://sklearn_classifier\", \"skrf\")\n", - "fn.apply(auto_mount())\n", - "\n", - "# define model\n", - "params = {\n", - " \"name\" : \"tasks random forest\",\n", - " \"params\" : {\n", - " \"sample\" : -5_000, # 5k random rows,\n", - " \"model_pkg_class\" : \"sklearn.ensemble.RandomForestClassifier\",\n", - " \"label_column\" : \"interest_level\",\n", - " \"CLASS_n_estimators\" : 100,\n", - " \"CLASS_min_samples_leaf\" : 1,\n", - " \"CLASS_n_jobs\" : -1,\n", - " \"CLASS_oob_score\" : True}\n", - "}\n", - "\n", - "train_run = fn.run(NewTask(**params), inputs={\"dataset\" : acquire_run.outputs[\"rent\"]},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Importances

\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=train_run.outputs['feature-importances'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "data = acquire_run.outputs[\"rent\"]\n", - "labels = \"interest_level\"\n", - "model = train_run.outputs[\"model\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt\n", - "[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances\n", - "[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y\n", - "[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default\n", - "[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed\n" - ] - } - ], - "source": [ - "fi_perms = perms_fn.run(\n", - " NewTask(params={\"labels\": labels, \n", - " \"plots_dest\": \"plots\"}),\n", - " inputs={\"model\": model, \"dataset\": data},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=fi_perms.outputs['feature importances-permute'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/feature_perms/1.0.0/src/feature_perms.py b/functions/development/feature_perms/1.0.0/src/feature_perms.py deleted file mode 100644 index 3a6c9948..00000000 --- a/functions/development/feature_perms/1.0.0/src/feature_perms.py +++ /dev/null @@ -1,160 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import numpy as np -import pandas as pd -import numbers - 
-import sklearn -from sklearn.base import clone -from sklearn.utils import check_random_state - -import matplotlib.pyplot as plt -import seaborn as sns - -from cloudpickle import load - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem -from mlrun.artifacts import get_model, PlotArtifact -from typing import Union, Callable, List - - -def _get_n_samples_bootstrap(n_samples, max_samples) -> int: - """get the number of samples in a bootstrap sample - - returns the total number of samples to draw for the bootstrap sample - - private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py - - :param n_samples: Number of samples in the dataset. - :param max_samples: - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. - """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _get_unsampled_ix(random_state, n_samples: int) -> np.array: - """ - future-proof get unsampled indices - """ - n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples) - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_bootstrap) - sample_counts = np.bincount(sample_indices, minlength=n_samples) - - return 
np.arange(n_samples)[sample_counts == 0] - - -def _oob_classifier_accuracy(rf, X_train, y_train) -> float: - """ - Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier. - - https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425 - """ - X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train - y = y_train.values if isinstance(y_train, pd.Series) else y_train - - n_samples = len(X) - n_classes = len(np.unique(y)) - predictions = np.zeros((n_samples, n_classes)) - for tree in rf.estimators_: - unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples) - tree_preds = tree.predict_proba(X[unsampled_indices, :]) - predictions[unsampled_indices] += tree_preds - - predicted_class_indexes = np.argmax(predictions, axis=1) - predicted_classes = [rf.classes_[i] for i in predicted_class_indexes] - - oob_score = np.mean(y == predicted_classes) - - return oob_score - - -def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline 
- m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - ) diff --git a/functions/development/feature_perms/1.0.0/src/function.yaml b/functions/development/feature_perms/1.0.0/src/function.yaml deleted file mode 100644 index 713981fd..00000000 --- a/functions/development/feature_perms/1.0.0/src/function.yaml +++ /dev/null @@ -1,63 +0,0 @@ -kind: job -metadata: - name: feature-perms - tag: '' - hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7 - project: '' - labels: - author: yjb - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: permutation_importance - entry_points: - permutation_importance: - name: permutation_importance - doc: 'calculate change in metric - - - type ''permute'' uses a pre-estimated model - - type ''dropcol'' uses a re-estimates model' - parameters: - - name: context - type: MLClientCtx - doc: the function's execution context - default: '' - - name: model - type: DataItem - doc: a trained model - default: '' - - name: dataset - type: DataItem - doc: features and ground truths, regression targets - default: '' - - name: labels - type: str - default: '' - - name: figsz - doc: 
matplotlib figure size - default: - - 10 - - 5 - - name: plots_dest - type: str - doc: path within artifact store - default: plots - - name: fitype - type: str - default: permute - outputs: - - default: '' - lineno: 93 - description: estimate feature importances using permutations - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0
gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3N
lcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA
9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py - affinity: null -verbose: false diff --git a/functions/development/feature_perms/1.0.0/src/item.yaml b/functions/development/feature_perms/1.0.0/src/item.yaml deleted file mode 100644 index b9f712a4..00000000 --- a/functions/development/feature_perms/1.0.0/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: 
-- data-analysis -description: estimate feature importances using permutations -doc: '' -example: feature_perms.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: yjb -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: feature-perms -platformVersion: 3.2.0 -spec: - filename: feature_perms.py - handler: permutation_importance - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 1.0.0 diff --git a/functions/development/feature_perms/1.0.0/src/requirements.txt b/functions/development/feature_perms/1.0.0/src/requirements.txt deleted file mode 100644 index f53a3289..00000000 --- a/functions/development/feature_perms/1.0.0/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -mlrun -sklearn -matplotlib -seaborn -scikit-plot - diff --git a/functions/development/feature_perms/1.0.0/src/test_feature_perms.py b/functions/development/feature_perms/1.0.0/src/test_feature_perms.py deleted file mode 100644 index b270292d..00000000 --- a/functions/development/feature_perms/1.0.0/src/test_feature_perms.py +++ /dev/null @@ -1,98 +0,0 @@ -from mlrun import code_to_function, import_function -from pathlib import Path -import os - -ARTIFACTS_PATH = 'artifacts' -DATA_URL = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv" -FEATURE_OUTPUT = ARTIFACTS_PATH + "/feature-importances-permute-tbl.parquet" - - -def arc_to_parquet(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - archive_func = import_function('hub://arc_to_parquet') - archive_run = archive_func.run(handler="arc_to_parquet", - params={"key": "rent", "stats": True, "file_ext": "csv"}, - inputs={"archive_url": DATA_URL}, - artifact_path=os.getcwd() + '/artifacts' - , local=True - ) - - -def sklearn_classifier(run): - cwd = os.getcwd() - file_path = str(Path(cwd).parent.absolute()) + "/sklearn_classifier/sklearn_classifier.py" - fn = code_to_function(name='test_sklearn_classifier', - 
filename=file_path, - handler="train_model", - kind="local", - ) - fn.spec.command = file_path - fn.run(params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - handler="train_model", - inputs={"dataset": run.outputs["rent"]}, - artifact_path='artifacts' - # , local=True - ) - - -def train_model(): - from mlrun import import_function - from mlrun.platforms import auto_mount - - train = import_function('hub://sklearn_classifier') - # .apply(auto_mount()) - - train_run = train.run( - inputs={"dataset": "artifacts/rent.csv"}, - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True}, - local=True) - - -def test_feature_selection_run_local(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fn = code_to_function(name='test_run_local_feature_perms', - filename="feature_perms.py", - handler="permutation_importance", - kind="local", - ) - fn.spec.command = "feature_perms.py" - fi_perms = fn.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path='artifacts') - assert Path(FEATURE_OUTPUT).is_file() - - -def test_feature_perms_import_function(): - arc_to_parquet() - train_model() - data = "artifacts/rent.csv" - labels = "interest_level" - model = "model/model.pkl" - fi_perms = import_function("function.yaml") - fi_perms.run(params={"labels": labels, - "plots_dest": "plots"}, - inputs={"model": model, "dataset": data}, - artifact_path=os.getcwd() + '/artifacts' - , local=True) - assert Path(FEATURE_OUTPUT).is_file() diff --git 
a/functions/development/feature_perms/1.0.0/static/documentation.html b/functions/development/feature_perms/1.0.0/static/documentation.html deleted file mode 100644 index ffbe8803..00000000 --- a/functions/development/feature_perms/1.0.0/static/documentation.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - -feature_perms package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

feature_perms package

-
-

Submodules

-
-
-

feature_perms.feature_perms module

-
-
-feature_perms.feature_perms.permutation_importance(context: mlrun.execution.MLClientCtx, model: mlrun.datastore.base.DataItem, dataset: mlrun.datastore.base.DataItem, labels: str, figsz=(10, 5), plots_dest: str = 'plots', fitype: str = 'permute')pandas.core.frame.DataFrame[source]
-

calculate change in metric

-

type ‘permute’ uses a pre-estimated model -type ‘dropcol’ uses a re-estimates model

-
-
Parameters
-
    -
  • context – the function’s execution context

  • -
  • model – a trained model

  • -
  • dataset – features and ground truths, regression targets

  • -
-
-
-

:param labels name of the ground truths column -:param figsz: matplotlib figure size -:param plots_dest: path within artifact store -:

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/1.0.0/static/example.html b/functions/development/feature_perms/1.0.0/static/example.html deleted file mode 100644 index 5a492f6f..00000000 --- a/functions/development/feature_perms/1.0.0/static/example.html +++ /dev/null @@ -1,1068 +0,0 @@ - - - - - - - -permutation_importances as reusable function - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

permutation_importances as reusable function

-
-

function code

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-    
-    returns the total number of samples to draw for the bootstrap sample
-    
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples: 
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts==0]
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-    
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-    
-    return oob_score
-
-def permutation_importances(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute"
-) -> pd.DataFrame:
-    """calculate change in metric
-    
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-    
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix='.pkl')
-    model = load(open(str(model_file), "rb"))
-    
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-    
-    # this will be paramettrized next version, and include regression
-    metric = _oob_classifier_accuracy
-    
-    baseline = metric(model, X, y)
-    
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    # create a feature importance table with desired labels
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-                         local_path=f"{plots_dest}/feature-permutations.html")
-    context.log_dataset(f"feature-importances-{fitype}-tbl", df=feature_imp, index=False)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

save function

-
-
-
from mlrun import code_to_function
-from mlrun.platforms.other import auto_mount
-
-gpus = False
-
-# create job function object from notebook code
-fn_params = {
-    "name"        : "feature-perms",
-    "handler"     : "permutation_importances",
-    "kind"        : "job",
-    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
-    "description" : "estimate feature importances using permutations",
-    "categories"  : ["analysis"],
-    "labels"      : {"author": "yjb"}
-}
-
-perms_fn = code_to_function(**fn_params)
-perms_fn.apply(auto_mount())
-perms_fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7feb0104efd0>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-from mlrun import NewTask, mlconf
-
-
-
-
-
-

get some data

-
-
-
data_url = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv"
-
-fn = import_function("hub://arc_to_parquet", "a2p")
-fn.apply(auto_mount())
-
-params = {
-    "name" : "tasks arc-to-parq",
-    "params" : {"key":"rent", "stats": True, "file_ext":"csv"}
-}
-acquire_run = fn.run(NewTask(**params),inputs={"archive_url" : data_url},
-                     artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:25,486 Job is running in the background, pod: tasks-arc-to-parq-xqkr7
-[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet
-[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp
-[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading
-[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y
-
-[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default
-[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed
-
-
-
-
-
-
-

train a model

-
-
-
fn = import_function("hub://sklearn_classifier", "skrf")
-fn.apply(auto_mount())
-
-# define model
-params = {
-    "name" : "tasks random forest",
-    "params" : {
-        "sample"                 : -5_000, # 5k random rows,
-        "model_pkg_class"        : "sklearn.ensemble.RandomForestClassifier",
-        "label_column"           : "interest_level",
-        "CLASS_n_estimators"     : 100,
-        "CLASS_min_samples_leaf" : 1,
-        "CLASS_n_jobs"           : -1,
-        "CLASS_oob_score"        : True}
-}
-
-train_run = fn.run(NewTask(**params), inputs={"dataset" : acquire_run.outputs["rent"]},
-                   artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5
-[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model
-[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y
-[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N
-[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N
-[mlrun] 2020-06-07 19:58:37,806 log artifact precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N
-[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N
-[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y
-
-[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default
-[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=train_run.outputs['feature-importances'])
-
-
-
-
-

Feature Importances

-
-
-
-
-
data   = acquire_run.outputs["rent"]
-labels = "interest_level"
-model  = train_run.outputs["model"]
-
-
-
-
-
-
-
fi_perms = perms_fn.run(
-    NewTask(params={"labels": labels, 
-                    "plots_dest": "plots"}),
-    inputs={"model": model, "dataset": data},
-    artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt
-[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances
-[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y
-[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y
-
-[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default
-[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=fi_perms.outputs['feature importances-permute'])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/feature_perms/1.0.0/static/function.html b/functions/development/feature_perms/1.0.0/static/function.html deleted file mode 100644 index f9520633..00000000 --- a/functions/development/feature_perms/1.0.0/static/function.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: feature-perms
-  tag: ''
-  hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7
-  project: ''
-  labels:
-    author: yjb
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: permutation_importance
-  entry_points:
-    permutation_importance:
-      name: permutation_importance
-      doc: 'calculate change in metric
-
-
-        type ''permute'' uses a pre-estimated model
-
-        type ''dropcol'' uses a re-estimates model'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function's execution context
-        default: ''
-      - name: model
-        type: DataItem
-        doc: a trained model
-        default: ''
-      - name: dataset
-        type: DataItem
-        doc: features and ground truths, regression targets
-        default: ''
-      - name: labels
-        type: str
-        default: ''
-      - name: figsz
-        doc: matplotlib figure size
-        default:
-        - 10
-        - 5
-      - name: plots_dest
-        type: str
-        doc: path within artifact store
-        default: plots
-      - name: fitype
-        type: str
-        default: permute
-      outputs:
-      - default: ''
-      lineno: 93
-  description: estimate feature importances using permutations
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1
iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10
gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9
kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/1.0.0/static/item.html b/functions/development/feature_perms/1.0.0/static/item.html deleted file mode 100644 index daa64744..00000000 --- a/functions/development/feature_perms/1.0.0/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: estimate feature importances using permutations
-doc: ''
-example: feature_perms.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: yjb
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: feature-perms
-platformVersion: 3.2.0
-spec:
-  filename: feature_perms.py
-  handler: permutation_importance
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 1.0.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/1.0.0/static/source.html b/functions/development/feature_perms/1.0.0/static/source.html deleted file mode 100644 index 3013427a..00000000 --- a/functions/development/feature_perms/1.0.0/static/source.html +++ /dev/null @@ -1,182 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-def permutation_importance(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute",
-) -> pd.DataFrame:
-    """calculate change in metric
-
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
-    model = load(open(str(model_file), "rb"))
-
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-
-    metric = _oob_classifier_accuracy
-
-    baseline = metric(model, X, y)
-
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            #model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(
-        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-        local_path=f"{plots_dest}/feature-permutations.html",
-    )
-    context.log_dataset(
-        f"feature-importances-{fitype}-tbl", df=feature_imp, index=False
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/src/README.ipynb b/functions/development/feature_perms/1.1.0/src/README.ipynb deleted file mode 100644 index 0929a6f6..00000000 --- a/functions/development/feature_perms/1.1.0/src/README.ipynb +++ /dev/null @@ -1,788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# feature importances" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are a number of ways to compute feature importances and **the default estimates reported by scikit learn can be shown to be biased** under certain circumstances. In addition, many non-tree algorithms do not provide conveniently calculated feature importance estimates. The following demonstration is based on material that draws heavily from the following sources:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## references\n", - "\n", - "\n", - "### repos\n", - "\n", - "* **[Feature importances for scikit-learn machine learning models](https://github.com/parrt/random-forest-importances)**, [MIT License](https://github.com/parrt/random-forest-importances/blob/master/LICENSE)\n", - "* **[Scikit-Learn ensemble module - forests](https://github.com/scikit-learn/scikit-learn/blob/0.23.1/sklearn/ensemble/_forest.py)**, [BSD License](https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/ensemble/_forest.py#L40)\n", - "* **[ELI5 - Permutation Importance](https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html)** \n", - "\n", - "### articles\n", - "\n", - "Strobl, C., Boulesteix, A., Zeileis, A. et al. **[Bias in random forest variable importance measures: Illustrations, sources and a solution](https://link.springer.com/article/10.1186/1471-2105-8-25#citeas)**. BMC Bioinformatics 8, 25 (2007). https://doi.org/10.1186/1471-2105-8-25 \n", - "\n", - "Strobl, C., Boulesteix, A., Kneib, T. et al. 
**[Conditional variable importance for random forests](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307#citeas)**. BMC Bioinformatics 9, 307 (2008). https://doi.org/10.1186/1471-2105-9-307 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## what we'll do\n", - "\n", - "* demonstrate an issue with default feature importance estimates \n", - "* provide alternatives and compare to the default \n", - "* create a new function `feature_perms` that implements a computationally simple algorithm \n", - "* create a new function `dropcol_importances` that implements a computationally intensive algorithm that is more accurate\n", - "* test our new functions\n", - "\n", - "It should be noted that although we are developing this notebook using a classification example, an almost identical presentation can be done for regression." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier as SomeModel\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from typing import Union, Callable, List" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## default feature importances\n", - "\n", - "This is a function that plots default feature importances from an estimated model object when available. 
It is taken from mlrun's current source-code implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def feature_importances(\n", - " model: SomeModel,\n", - " header: List[str], \n", - " figsz=(10, 5)\n", - ") -> None:\n", - " \"\"\"Display default model feature importances\n", - "\n", - " Only works for models with attribute 'feature_importances_`\n", - "\n", - " :param model: fitted model with a feature_importances_ attribute\n", - " :param header: feature labels\n", - " :param figsz: matplotlib figure size\n", - " \"\"\"\n", - " if not hasattr(model, \"feature_importances_\"):\n", - " raise Exception(\n", - " \"feature importances are only available for some models\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(model.feature_importances_, header)\n", - " feature_imp = pd.DataFrame(\n", - " sorted(zipped), columns=[\"freq\", \"feature\"]).sort_values(\n", - " by=\"freq\", ascending=False)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"freq\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"features\")\n", - " plt.tight_layout();\n", - " \n", - " return feature_imp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permuted features\n", - "\n", - "A proposed solution that has general applicability is randomly permuted features**[refs](#references)**: \n", - "* loop through the feature set \n", - "* shuffle one feature \n", - "* run predict\n", - "* compare the (marginal) change in accuracy (or other metric of interest) \n", - "\n", - "This approach is computationally more demanding than relying on the default values, however it can be easily parallelized. To perform the estimation we only need an estimated model and a held-out test set. 
The following was proposed in **[Beware Default Random Forest Importances](https://explained.ai/rf-importance/index.html)**:\n", - "\n", - "( the following 3 glue functions will no longer be publicly visible in the sklearn package from 0.24 onwards, consider this a temporary hack while we refactor these away)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**the following has been refactored in final version of function:**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from distutils.version import LooseVersion\n", - "import numpy as np\n", - "from sklearn.utils import check_random_state\n", - "\n", - "def _generate_sample_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to _parallel_build_trees function.\n", - " taken from:\n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L116\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n", - "\n", - " return sample_indices\n", - "\n", - "def _generate_unsampled_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to forest._set_oob_score function.\n", - " taken from: \n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L126\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " sample_indices = _generate_sample_indices(random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - " unsampled_mask = sample_counts == 0\n", - " indices_range = np.arange(n_samples)\n", - " unsampled_indices 
= indices_range[unsampled_mask]\n", - "\n", - " return unsampled_indices\n", - "\n", - "def _get_unsampled_indices(tree, n_samples: int):\n", - " \"\"\"\n", - " An interface to get unsampled indices regardless of sklearn version.\n", - " \"\"\"\n", - " import warnings\n", - " warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", - " if LooseVersion(sklearn.__version__) >= LooseVersion(\"0.22\"):\n", - " # Version 0.22 or newer uses 3 arguments.\n", - " from sklearn.ensemble.forest import _get_n_samples_bootstrap\n", - " n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " return _generate_unsampled_indices(tree.random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " else:\n", - " # Version 0.21 or older uses only two arguments.\n", - " return _generate_unsampled_indices(tree.random_state, n_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following function estimates classifier accuracy and has been borrowed from **[references](#references)**. 
See **[breitman on oob](https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf)** for details on out-of-bag estimation:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def oob_classifier_accuracy(rf, X_train: np.array, y_train: np.array) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values\n", - " y = y_train.values\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_indices(tree, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Putting it all together:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def permutation_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array, \n", - " header: List[str],\n", - " metric: Callable = oob_classifier_accuracy,\n", - " figsz=(10, 5)\n", - ") -> np.array:\n", - " \"\"\"calculate change in metric from permuting feature columns\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " uses a pre-estimated model\n", - "\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truths, regression targets\n", - " :param header: column labels for X_train\n", - " 
:param figsz: matplotlib figure size\n", - " \n", - " \"\"\"\n", - " baseline = metric(model, X_train, y_train)\n", - " imp = []\n", - " for col in X_train.columns:\n", - " save = X_train[col].copy()\n", - " X_train[col] = np.random.permutation(X_train[col])\n", - " m = metric(model, X_train, y_train)\n", - " X_train[col] = save\n", - " imp.append(baseline - m)\n", - " \n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"feature permutation importances\")\n", - " plt.tight_layout()\n", - "\n", - " return np.array(feature_imp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances\n", - "\n", - "According to our **[references](#references)** a more accurate measure of feature importance would have us re-estimate the model after dropping a column. This is considered as being close to \"ideal\". 
Unfortunately, the entire model needs to be re-estimated for each column and without some approximating shortcut this is likely to be infeasible for large datasets.\n", - "\n", - "Here is the suggested implementation and **don't run this on big models!**:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def dropcol_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array,\n", - " header: List[str] = [],\n", - " random_state: int = 1994,\n", - " figsz=(10, 5)\n", - ") -> pd.DataFrame:\n", - " \"\"\"drop columns and re-estimate model\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " :param rf: model to fit\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truth labels\n", - "\n", - " Returns:\n", - " pd.DataFrame: table of diffs vs baseline metric\n", - " \"\"\"\n", - " # cloning makes copy of model pre-fit\n", - " # calculate a baseline with all features\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_train, y_train)\n", - " baseline = model_.oob_score_\n", - " \n", - " # now drop each colum, refit model and calc metric\n", - " imp = []\n", - " for col in X_train.columns:\n", - " X = X_train.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X, y_train)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " \n", - " # put it all in a table\n", - " imp = np.array(imp)\n", - " feature_imps = pd.DataFrame(\n", - " data={'feature': X_train.columns,\n", - " 'importance': imp})\n", - " #feature_imps.set_index('feature', inplace=True)\n", - " feature_imps.sort_values('importance', ascending=True, inplace=True)\n", - " \n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imps)\n", - " plt.title(\"drop column feature 
importances\")\n", - " plt.tight_layout()\n", - " \n", - " return feature_imps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## demonstration\n", - "\n", - "In this demonstratuon we are going to take a fraction of a fraction of **[Kaggle's RentHop rental listing interest competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)**--the complete dataset is presently >80GB, we'll be looking at 5K rows. \n", - "\n", - "The competition's **[goal](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)** was\n", - "> to predict the number of inquiries a new listing receives based on the listing’s creation date and other features. \n", - "\n", - "Doing so would help **[RentHop](https://www.renthop.com/)**\n", - "> better handle fraud control, identify potential listing quality issues, and allow owners and agents to better understand renters’ needs and preferences." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = \"/User/artifacts/two-sigma-connect-rental-listing-inquiries/\"\n", - "NFRAC = 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample dimensions (4935, 6)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bathroomsbedroomspricelongitudelatitudeinterest_level
182351.001800-73.996040.71972
421401.023000-73.987940.76533
46771.011350-73.899640.85492
\n", - "
" - ], - "text/plain": [ - " bathrooms bedrooms price longitude latitude interest_level\n", - "18235 1.0 0 1800 -73.9960 40.7197 2\n", - "42140 1.0 2 3000 -73.9879 40.7653 3\n", - "4677 1.0 1 1350 -73.8996 40.8549 2" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(data + 'rent.csv').sample(frac=NFRAC)\n", - "print(\"sample dimensions\", df.shape)\n", - "df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']\n", - "dfr = df[features]\n", - "\n", - "# drop price column\n", - "X_train, y_train = dfr.drop('price', axis=1), dfr['price']\n", - "\n", - "# insert column with random values\n", - "X_train['random'] = np.random.random(size=len(X_train))\n", - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'random']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_jobs=-1, oob_score=True)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# define model\n", - "model_params = {\n", - " \"n_estimators\" : 100, \n", - " \"min_samples_leaf\" : 1,\n", - " \"n_jobs\" : -1,\n", - " \"oob_score\" : True\n", - "}\n", - "\n", - "model = SomeModel(**model_params)\n", - "\n", - "# estimate\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### to run this the model needs a default attribute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "default feature_importances [0.01683784 0.03215169 0.29983429 0.30418813 0.34698806]\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "if hasattr(model, \"feature_importances_\"):\n", - " print(\"default feature_importances\", model.feature_importances_)\n", - " feature_importances(model, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permutation importances\n", - "\n", - "No need to check for default attributes or functions, this can be run on any kind of model:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.06545086119554205, 'longitude'],\n", - " [0.06160081053698076, 'latitude'],\n", - " [0.053495440729483285, 'bedrooms'],\n", - " [0.021681864235055734, 'bathrooms'],\n", - " [0.0004052684903748799, 'random']], dtype=object)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pi = permutation_importances(model, X_train, y_train, features)\n", - "pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
4random-0.042756
0bathrooms0.001824
1bedrooms0.023100
2longitude0.049240
3latitude0.051874
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "4 random -0.042756\n", - "0 bathrooms 0.001824\n", - "1 bedrooms 0.023100\n", - "2 longitude 0.049240\n", - "3 latitude 0.051874" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dc = dropcol_importances(model, X_train, y_train)\n", - "dc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## conclusions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So I would say location is a prime factor, then the number of bedrooms. Bathrooms often is gte bedrooms, and is likely correlated so one of them should likely be dropped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "toc-autonumbering": false, - "toc-showcode": false, - "toc-showmarkdowntxt": false - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/feature_perms/1.1.0/src/feature_perms.ipynb b/functions/development/feature_perms/1.1.0/src/feature_perms.ipynb deleted file mode 100644 index 77da7b55..00000000 --- a/functions/development/feature_perms/1.1.0/src/feature_perms.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# permutation_importances as reusable function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## function code" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import 
numbers\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "from sklearn.utils import check_random_state\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from cloudpickle import load\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem\n", - "from mlrun.artifacts import get_model, PlotArtifact\n", - "from typing import Union, Callable, List\n", - "\n", - "def _get_n_samples_bootstrap(n_samples, max_samples) -> int:\n", - " \"\"\"get the number of samples in a bootstrap sample\n", - " \n", - " returns the total number of samples to draw for the bootstrap sample\n", - " \n", - " private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py\n", - "\n", - " :param n_samples: Number of samples in the dataset.\n", - " :param max_samples: \n", - " The maximum number of samples to draw from the total available:\n", - " - if float, this indicates a fraction of the total and should be\n", - " the interval `(0, 1)`;\n", - " - if int, this indicates the exact number of samples;\n", - " - if None, this indicates the total number of samples.\n", - " \"\"\"\n", - " if max_samples is None:\n", - " return n_samples\n", - "\n", - " if isinstance(max_samples, numbers.Integral):\n", - " if not (1 <= max_samples <= n_samples):\n", - " msg = \"`max_samples` must be in range 1 to {} but got value {}\"\n", - " raise ValueError(msg.format(n_samples, max_samples))\n", - " return max_samples\n", - "\n", - " if isinstance(max_samples, numbers.Real):\n", - " if not (0 < max_samples < 1):\n", - " msg = \"`max_samples` must be in range (0, 1) but got value {}\"\n", - " raise ValueError(msg.format(max_samples))\n", - " return int(round(n_samples * max_samples))\n", - "\n", - " msg = \"`max_samples` should be int or float, but got type '{}'\"\n", - " raise TypeError(msg.format(type(max_samples)))\n", - "\n", - "def _get_unsampled_ix(random_state, n_samples: int) -> np.array:\n", 
- " \"\"\"\n", - " future-proof get unsampled indices\n", - " \"\"\"\n", - " n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - "\n", - " return np.arange(n_samples)[sample_counts==0]\n", - "\n", - "def _oob_classifier_accuracy(rf, X_train, y_train) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train\n", - " y = y_train.values if isinstance(y_train, pd.Series) else y_train\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score\n", - "\n", - "def permutation_importances(\n", - " context: MLClientCtx,\n", - " model: DataItem,\n", - " dataset: DataItem,\n", - " labels: str,\n", - " figsz=(10, 5),\n", - " plots_dest: str = \"plots\",\n", - " fitype: str = \"permute\"\n", - ") -> pd.DataFrame:\n", - " \"\"\"calculate change in metric\n", - " \n", - " type 'permute' uses a pre-estimated model\n", - " type 'dropcol' uses a re-estimates model\n", - " \n", - " :param context: the function's execution context\n", - " :param model: a trained model\n", - " :param dataset: 
features and ground truths, regression targets\n", - " :param labels name of the ground truths column\n", - " :param figsz: matplotlib figure size\n", - " :param plots_dest: path within artifact store\n", - " :\n", - " \"\"\"\n", - " model_file, model_data, _ = get_model(model.url, suffix='.pkl')\n", - " model = load(open(str(model_file), \"rb\"))\n", - " \n", - " X = dataset.as_df()\n", - " y = X.pop(labels)\n", - " header = X.columns\n", - " \n", - " # this will be paramettrized next version, and include regression\n", - " metric = _oob_classifier_accuracy\n", - " \n", - " baseline = metric(model, X, y)\n", - " \n", - " imp = []\n", - " for col in X.columns:\n", - " if fitype is \"permute\":\n", - " save = X[col].copy()\n", - " X[col] = np.random.permutation(X[col])\n", - " m = metric(model, X, y)\n", - " X[col] = save\n", - " imp.append(baseline - m)\n", - " elif fitype is \"dropcol\":\n", - " X_ = X.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_, y)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " else:\n", - " raise ValueError(\"unknown fitype, only 'permute' or 'dropcol' permitted\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(f\"feature importances-{fitype}\")\n", - " plt.tight_layout()\n", - "\n", - " context.log_artifact(PlotArtifact(f\"feature importances-{fitype}\", body=plt.gcf()),\n", - " local_path=f\"{plots_dest}/feature-permutations.html\")\n", - " context.log_dataset(f\"feature-importances-{fitype}-tbl\", df=feature_imp, index=False)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## save function" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.platforms.other import auto_mount\n", - "\n", - "gpus = False\n", - "\n", - "# create job function object from notebook code\n", - "fn_params = {\n", - " \"name\" : \"feature-perms\",\n", - " \"handler\" : \"permutation_importances\",\n", - " \"kind\" : \"job\",\n", - " \"image\" : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n", - " \"description\" : \"estimate feature importances using permutations\",\n", - " \"categories\" : [\"analysis\"],\n", - " \"labels\" : {\"author\": \"yjb\"}\n", - "}\n", - "\n", - "perms_fn = code_to_function(**fn_params)\n", - "perms_fn.apply(auto_mount())\n", - "perms_fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "from mlrun import NewTask, mlconf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### get some data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:25,486 Job is running in 
the background, pod: tasks-arc-to-parq-xqkr7\n", - "[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet\n", - "[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp\n", - "[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading\n", - "[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default\n", - "[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed\n" - ] - } - ], - "source": [ - "data_url = \"https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv\"\n", - "\n", - "fn = import_function(\"hub://arc_to_parquet\", \"a2p\")\n", - "fn.apply(auto_mount())\n", - "\n", - "params = {\n", - " \"name\" : \"tasks arc-to-parq\",\n", - " \"params\" : {\"key\":\"rent\", \"stats\": True, \"file_ext\":\"csv\"}\n", - "}\n", - "acquire_run = fn.run(NewTask(**params),inputs={\"archive_url\" : data_url},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### train a model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5\n", - "[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model\n", - "[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y\n", - "[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N\n", - "[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N\n", - "[mlrun] 2020-06-07 19:58:37,806 log artifact 
precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N\n", - "[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N\n", - "[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default\n", - "[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed\n" - ] - } - ], - "source": [ - "fn = import_function(\"hub://sklearn_classifier\", \"skrf\")\n", - "fn.apply(auto_mount())\n", - "\n", - "# define model\n", - "params = {\n", - " \"name\" : \"tasks random forest\",\n", - " \"params\" : {\n", - " \"sample\" : -5_000, # 5k random rows,\n", - " \"model_pkg_class\" : \"sklearn.ensemble.RandomForestClassifier\",\n", - " \"label_column\" : \"interest_level\",\n", - " \"CLASS_n_estimators\" : 100,\n", - " \"CLASS_min_samples_leaf\" : 1,\n", - " \"CLASS_n_jobs\" : -1,\n", - " \"CLASS_oob_score\" : True}\n", - "}\n", - "\n", - "train_run = fn.run(NewTask(**params), inputs={\"dataset\" : acquire_run.outputs[\"rent\"]},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Importances

\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=train_run.outputs['feature-importances'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "data = acquire_run.outputs[\"rent\"]\n", - "labels = \"interest_level\"\n", - "model = train_run.outputs[\"model\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt\n", - "[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances\n", - "[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y\n", - "[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default\n", - "[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed\n" - ] - } - ], - "source": [ - "fi_perms = perms_fn.run(\n", - " NewTask(params={\"labels\": labels, \n", - " \"plots_dest\": \"plots\"}),\n", - " inputs={\"model\": model, \"dataset\": data},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=fi_perms.outputs['feature importances-permute'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/src/feature_perms.py b/functions/development/feature_perms/1.1.0/src/feature_perms.py deleted file mode 100644 index 13caae32..00000000 --- a/functions/development/feature_perms/1.1.0/src/feature_perms.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may 
not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import numpy as np -import pandas as pd -import numbers - -import sklearn -from sklearn.base import clone -from sklearn.utils import check_random_state - -import matplotlib.pyplot as plt -import seaborn as sns - -from cloudpickle import load - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem -from mlrun.artifacts import get_model, PlotArtifact -from typing import Union, Callable, List - - -def _get_n_samples_bootstrap(n_samples, max_samples) -> int: - """get the number of samples in a bootstrap sample - - returns the total number of samples to draw for the bootstrap sample - - private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py - - :param n_samples: Number of samples in the dataset. - :param max_samples: - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. 
- """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _get_unsampled_ix(random_state, n_samples: int) -> np.array: - """ - future-proof get unsampled indices - """ - n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples) - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_bootstrap) - sample_counts = np.bincount(sample_indices, minlength=n_samples) - - return np.arange(n_samples)[sample_counts == 0] - - -def _oob_classifier_accuracy(rf, X_train, y_train) -> float: - """ - Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier. 
- - https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425 - """ - X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train - y = y_train.values if isinstance(y_train, pd.Series) else y_train - - n_samples = len(X) - n_classes = len(np.unique(y)) - predictions = np.zeros((n_samples, n_classes)) - for tree in rf.estimators_: - unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples) - tree_preds = tree.predict_proba(X[unsampled_indices, :]) - predictions[unsampled_indices] += tree_preds - - predicted_class_indexes = np.argmax(predictions, axis=1) - predicted_classes = [rf.classes_[i] for i in predicted_class_indexes] - - oob_score = np.mean(y == predicted_classes) - - return oob_score - - -def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline - m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - 
imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - ) diff --git a/functions/development/feature_perms/1.1.0/src/function.yaml b/functions/development/feature_perms/1.1.0/src/function.yaml deleted file mode 100644 index 713981fd..00000000 --- a/functions/development/feature_perms/1.1.0/src/function.yaml +++ /dev/null @@ -1,63 +0,0 @@ -kind: job -metadata: - name: feature-perms - tag: '' - hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7 - project: '' - labels: - author: yjb - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: permutation_importance - entry_points: - permutation_importance: - name: permutation_importance - doc: 'calculate change in metric - - - type ''permute'' uses a pre-estimated model - - type ''dropcol'' uses a re-estimates model' - parameters: - - name: context - type: MLClientCtx - doc: the function's execution context - default: '' - - name: model - type: DataItem - doc: a trained model - default: '' - - name: dataset - type: DataItem - doc: features and ground truths, regression targets - default: '' - - name: labels - type: str - default: '' - - name: figsz - doc: matplotlib figure size - default: - - 10 - - 5 - - name: plots_dest - type: str - doc: path within artifact store - default: plots - - name: fitype - type: str - default: 
permute - outputs: - - default: '' - lineno: 93 - description: estimate feature importances using permutations - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdC
huX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcm
VkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG
0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py - affinity: null -verbose: false diff --git a/functions/development/feature_perms/1.1.0/src/item.yaml b/functions/development/feature_perms/1.1.0/src/item.yaml deleted file mode 100644 index bd909d3e..00000000 --- a/functions/development/feature_perms/1.1.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: estimate feature importances using permutations -doc: '' -example: feature_perms.ipynb -generationDate: 2022-08-28:17-25 -hidden: false 
-icon: '' -labels: - author: yjb -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: feature-perms -platformVersion: 3.5.0 -spec: - filename: feature_perms.py - handler: permutation_importance - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 1.1.0 -test_valid : False diff --git a/functions/development/feature_perms/1.1.0/src/requirements.txt b/functions/development/feature_perms/1.1.0/src/requirements.txt deleted file mode 100644 index 70a079c7..00000000 --- a/functions/development/feature_perms/1.1.0/src/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scikit-learn -matplotlib -seaborn -scikit-plot - diff --git a/functions/development/feature_perms/1.1.0/src/test_feature_perms.py b/functions/development/feature_perms/1.1.0/src/test_feature_perms.py deleted file mode 100644 index a59891ea..00000000 --- a/functions/development/feature_perms/1.1.0/src/test_feature_perms.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from mlrun import code_to_function, import_function -from pathlib import Path -import os - -ARTIFACTS_PATH = 'artifacts' -DATA_URL = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv" -FEATURE_OUTPUT = "feature-importances-permute-tbl" - - -def arc_to_parquet(): - from mlrun import import_function - - archive_func = import_function('hub://arc_to_parquet') - - archive_run = archive_func.run( - handler="arc_to_parquet", - params={"key": "rent", "stats": True, "file_ext": "csv"}, - inputs={"archive_url": DATA_URL}, - artifact_path=os.getcwd() + '/artifacts', - local=True, - ) - - return archive_run.artifact('rent').url - - -def sklearn_classifier(run): - cwd = os.getcwd() - file_path = str(Path(cwd).parent.absolute()) + "/sklearn_classifier/sklearn_classifier.py" - fn = code_to_function( - name='test_sklearn_classifier', - filename=file_path, - handler="train_model", - kind="local", - ) - - fn.spec.command = file_path - fn.run( - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True, - }, - handler="train_model", - inputs={"dataset": run.outputs["rent"]}, - artifact_path='artifacts', - ) - - -def train_model(data): - from mlrun import import_function - - train = import_function('hub://sklearn_classifier') - - train_run = train.run( - inputs={"dataset": data}, - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True, - }, - local=True - ) - - return train_run.artifact('model').url - - -def test_feature_selection_run_local(): - data = arc_to_parquet() - model = train_model(data) - labels = "interest_level" - fn = 
code_to_function( - name='test_run_local_feature_perms', - filename="feature_perms.py", - handler="permutation_importance", - kind="local", - ) - fn.spec.command = "feature_perms.py" - - run = fn.run( - params={ - "labels": labels, - "plots_dest": "plots", - }, - inputs={ - "model": model, - "dataset": data, - }, - artifact_path='artifacts', - ) - - assert run.artifact(FEATURE_OUTPUT).get() - - -def test_feature_perms_import_function(): - data = arc_to_parquet() - model = train_model(data) - labels = "interest_level" - fn = import_function("function.yaml") - - run = fn.run( - params={ - "labels": labels, - "plots_dest": "plots" - }, - inputs={ - "model": model, - "dataset": data}, - artifact_path=os.getcwd() + '/artifacts', - local=True, - ) - - assert run.artifact(FEATURE_OUTPUT).get() diff --git a/functions/development/feature_perms/1.1.0/static/documentation.html b/functions/development/feature_perms/1.1.0/static/documentation.html deleted file mode 100644 index 66452f5a..00000000 --- a/functions/development/feature_perms/1.1.0/static/documentation.html +++ /dev/null @@ -1,242 +0,0 @@ - - - - - - - -feature_perms package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

feature_perms package

- -
- -
-
-
-
-
-

feature_perms package#

-
-

Submodules#

-
-
-

feature_perms.feature_perms module#

-
-
-feature_perms.feature_perms.permutation_importance(context: mlrun.execution.MLClientCtx, model: mlrun.datastore.base.DataItem, dataset: mlrun.datastore.base.DataItem, labels: str, figsz=(10, 5), plots_dest: str = 'plots', fitype: str = 'permute')pandas.core.frame.DataFrame[source]#
-

calculate change in metric

-

type ‘permute’ uses a pre-estimated model -type ‘dropcol’ uses a re-estimates model

-
-
Parameters
-
    -
  • context – the function’s execution context

  • -
  • model – a trained model

  • -
  • dataset – features and ground truths, regression targets

  • -
-
-
-

:param labels name of the ground truths column -:param figsz: matplotlib figure size -:param plots_dest: path within artifact store -:

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/static/example.html b/functions/development/feature_perms/1.1.0/static/example.html deleted file mode 100644 index a5869008..00000000 --- a/functions/development/feature_perms/1.1.0/static/example.html +++ /dev/null @@ -1,1184 +0,0 @@ - - - - - - - -permutation_importances as reusable function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

permutation_importances as reusable function

- -
- -
-
-
-
-
-

permutation_importances as reusable function#

-
-

function code#

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-    
-    returns the total number of samples to draw for the bootstrap sample
-    
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples: 
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts==0]
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-    
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-    
-    return oob_score
-
-def permutation_importances(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute"
-) -> pd.DataFrame:
-    """calculate change in metric
-    
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-    
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix='.pkl')
-    model = load(open(str(model_file), "rb"))
-    
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-    
-    # this will be paramettrized next version, and include regression
-    metric = _oob_classifier_accuracy
-    
-    baseline = metric(model, X, y)
-    
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    # create a feature importance table with desired labels
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-                         local_path=f"{plots_dest}/feature-permutations.html")
-    context.log_dataset(f"feature-importances-{fitype}-tbl", df=feature_imp, index=False)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

save function#

-
-
-
from mlrun import code_to_function
-from mlrun.platforms.other import auto_mount
-
-gpus = False
-
-# create job function object from notebook code
-fn_params = {
-    "name"        : "feature-perms",
-    "handler"     : "permutation_importances",
-    "kind"        : "job",
-    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
-    "description" : "estimate feature importances using permutations",
-    "categories"  : ["analysis"],
-    "labels"      : {"author": "yjb"}
-}
-
-perms_fn = code_to_function(**fn_params)
-perms_fn.apply(auto_mount())
-perms_fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7feb0104efd0>
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun import import_function
-from mlrun import NewTask, mlconf
-
-
-
-
-
-

get some data#

-
-
-
data_url = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv"
-
-fn = import_function("hub://arc_to_parquet", "a2p")
-fn.apply(auto_mount())
-
-params = {
-    "name" : "tasks arc-to-parq",
-    "params" : {"key":"rent", "stats": True, "file_ext":"csv"}
-}
-acquire_run = fn.run(NewTask(**params),inputs={"archive_url" : data_url},
-                     artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:25,486 Job is running in the background, pod: tasks-arc-to-parq-xqkr7
-[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet
-[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp
-[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading
-[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y
-
-[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default
-[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed
-
-
-
-
-
-
-

train a model#

-
-
-
fn = import_function("hub://sklearn_classifier", "skrf")
-fn.apply(auto_mount())
-
-# define model
-params = {
-    "name" : "tasks random forest",
-    "params" : {
-        "sample"                 : -5_000, # 5k random rows,
-        "model_pkg_class"        : "sklearn.ensemble.RandomForestClassifier",
-        "label_column"           : "interest_level",
-        "CLASS_n_estimators"     : 100,
-        "CLASS_min_samples_leaf" : 1,
-        "CLASS_n_jobs"           : -1,
-        "CLASS_oob_score"        : True}
-}
-
-train_run = fn.run(NewTask(**params), inputs={"dataset" : acquire_run.outputs["rent"]},
-                   artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5
-[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model
-[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y
-[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N
-[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N
-[mlrun] 2020-06-07 19:58:37,806 log artifact precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N
-[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N
-[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y
-
-[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default
-[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=train_run.outputs['feature-importances'])
-
-
-
-
-

Feature Importances

-
-
-
-
-
data   = acquire_run.outputs["rent"]
-labels = "interest_level"
-model  = train_run.outputs["model"]
-
-
-
-
-
-
-
fi_perms = perms_fn.run(
-    NewTask(params={"labels": labels, 
-                    "plots_dest": "plots"}),
-    inputs={"model": model, "dataset": data},
-    artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt
-[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances
-[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y
-[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y
-
-[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default
-[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=fi_perms.outputs['feature importances-permute'])
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/static/feature_perms.html b/functions/development/feature_perms/1.1.0/static/feature_perms.html deleted file mode 100644 index 5121c7f7..00000000 --- a/functions/development/feature_perms/1.1.0/static/feature_perms.html +++ /dev/null @@ -1,314 +0,0 @@ - - - - - - - -feature_perms.feature_perms - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for feature_perms.feature_perms

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-
[docs]def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline - m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - )
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/static/function.html b/functions/development/feature_perms/1.1.0/static/function.html deleted file mode 100644 index f9520633..00000000 --- a/functions/development/feature_perms/1.1.0/static/function.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: feature-perms
-  tag: ''
-  hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7
-  project: ''
-  labels:
-    author: yjb
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: permutation_importance
-  entry_points:
-    permutation_importance:
-      name: permutation_importance
-      doc: 'calculate change in metric
-
-
-        type ''permute'' uses a pre-estimated model
-
-        type ''dropcol'' uses a re-estimates model'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function's execution context
-        default: ''
-      - name: model
-        type: DataItem
-        doc: a trained model
-        default: ''
-      - name: dataset
-        type: DataItem
-        doc: features and ground truths, regression targets
-        default: ''
-      - name: labels
-        type: str
-        default: ''
-      - name: figsz
-        doc: matplotlib figure size
-        default:
-        - 10
-        - 5
-      - name: plots_dest
-        type: str
-        doc: path within artifact store
-        default: plots
-      - name: fitype
-        type: str
-        default: permute
-      outputs:
-      - default: ''
-      lineno: 93
-  description: estimate feature importances using permutations
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1
iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10
gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9
kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/static/item.html b/functions/development/feature_perms/1.1.0/static/item.html deleted file mode 100644 index 2cdb2735..00000000 --- a/functions/development/feature_perms/1.1.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: estimate feature importances using permutations
-doc: ''
-example: feature_perms.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yjb
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: feature-perms
-platformVersion: 3.5.0
-spec:
-  filename: feature_perms.py
-  handler: permutation_importance
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-test_valid : False
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/1.1.0/static/source.html b/functions/development/feature_perms/1.1.0/static/source.html deleted file mode 100644 index 9d9c9b8c..00000000 --- a/functions/development/feature_perms/1.1.0/static/source.html +++ /dev/null @@ -1,196 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-def permutation_importance(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute",
-) -> pd.DataFrame:
-    """calculate change in metric
-
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
-    model = load(open(str(model_file), "rb"))
-
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-
-    metric = _oob_classifier_accuracy
-
-    baseline = metric(model, X, y)
-
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            #model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(
-        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-        local_path=f"{plots_dest}/feature-permutations.html",
-    )
-    context.log_dataset(
-        f"feature-importances-{fitype}-tbl", df=feature_imp, index=False
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/latest/src/README.ipynb b/functions/development/feature_perms/latest/src/README.ipynb deleted file mode 100644 index 0929a6f6..00000000 --- a/functions/development/feature_perms/latest/src/README.ipynb +++ /dev/null @@ -1,788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# feature importances" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are a number of ways to compute feature importances and **the default estimates reported by scikit learn can be shown to be biased** under certain circumstances. In addition, many non-tree algorithms do not provide conveniently calculated feature importance estimates. The following demonstration is based on material that draws heavily from the following sources:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## references\n", - "\n", - "\n", - "### repos\n", - "\n", - "* **[Feature importances for scikit-learn machine learning models](https://github.com/parrt/random-forest-importances)**, [MIT License](https://github.com/parrt/random-forest-importances/blob/master/LICENSE)\n", - "* **[Scikit-Learn ensemble module - forests](https://github.com/scikit-learn/scikit-learn/blob/0.23.1/sklearn/ensemble/_forest.py)**, [BSD License](https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/ensemble/_forest.py#L40)\n", - "* **[ELI5 - Permutation Importance](https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html)** \n", - "\n", - "### articles\n", - "\n", - "Strobl, C., Boulesteix, A., Zeileis, A. et al. **[Bias in random forest variable importance measures: Illustrations, sources and a solution](https://link.springer.com/article/10.1186/1471-2105-8-25#citeas)**. BMC Bioinformatics 8, 25 (2007). https://doi.org/10.1186/1471-2105-8-25 \n", - "\n", - "Strobl, C., Boulesteix, A., Kneib, T. et al. 
**[Conditional variable importance for random forests](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307#citeas)**. BMC Bioinformatics 9, 307 (2008). https://doi.org/10.1186/1471-2105-9-307 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## what we'll do\n", - "\n", - "* demonstrate an issue with default feature importance estimates \n", - "* provide alternatives and compare to the default \n", - "* create a new function `feature_perms` that implements a computationally simple algorithm \n", - "* create a new function `dropcol_importances` that implements a computationally intensive algorithm that is more accurate\n", - "* test our new functions\n", - "\n", - "It should be noted that although we are developing this notebook using a classification example, an almost identical presentation can be done for regression." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "\n", - "from sklearn.ensemble import RandomForestClassifier as SomeModel\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from typing import Union, Callable, List" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## default feature importances\n", - "\n", - "This is a function that plots default feature importances from an estimated model object when available. 
It is taken from mlrun's current source-code implementation:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def feature_importances(\n", - " model: SomeModel,\n", - " header: List[str], \n", - " figsz=(10, 5)\n", - ") -> None:\n", - " \"\"\"Display default model feature importances\n", - "\n", - " Only works for models with attribute 'feature_importances_`\n", - "\n", - " :param model: fitted model with a feature_importances_ attribute\n", - " :param header: feature labels\n", - " :param figsz: matplotlib figure size\n", - " \"\"\"\n", - " if not hasattr(model, \"feature_importances_\"):\n", - " raise Exception(\n", - " \"feature importances are only available for some models\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(model.feature_importances_, header)\n", - " feature_imp = pd.DataFrame(\n", - " sorted(zipped), columns=[\"freq\", \"feature\"]).sort_values(\n", - " by=\"freq\", ascending=False)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"freq\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"features\")\n", - " plt.tight_layout();\n", - " \n", - " return feature_imp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permuted features\n", - "\n", - "A proposed solution that has general applicability is randomly permuted features**[refs](#references)**: \n", - "* loop through the feature set \n", - "* shuffle one feature \n", - "* run predict\n", - "* compare the (marginal) change in accuracy (or other metric of interest) \n", - "\n", - "This approach is computationally more demanding than relying on the default values, however it can be easily parallelized. To perform the estimation we only need an estimated model and a held-out test set. 
The following was proposed in **[Beware Default Random Forest Importances](https://explained.ai/rf-importance/index.html)**:\n", - "\n", - "( the following 3 glue functions will no longer be publicly visible in the sklearn package from 0.24 onwards, consider this a temporary hack while we refactor these away)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**the following has been refactored in final version of function:**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from distutils.version import LooseVersion\n", - "import numpy as np\n", - "from sklearn.utils import check_random_state\n", - "\n", - "def _generate_sample_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to _parallel_build_trees function.\n", - " taken from:\n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L116\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n", - "\n", - " return sample_indices\n", - "\n", - "def _generate_unsampled_indices(random_state: int, n_samples: int, n_samples_bootstrap: int):\n", - " \"\"\"\n", - " Private function used to forest._set_oob_score function.\n", - " taken from: \n", - " https://github.com/scikit-learn/scikit-learn/blob/2253807bb488b6de73796aef2de38a6dcf282d86/sklearn/ensemble/_forest.py#L126\n", - " (public availability to be deprecated by sklearn v0.24)\n", - " \"\"\"\n", - " sample_indices = _generate_sample_indices(random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - " unsampled_mask = sample_counts == 0\n", - " indices_range = np.arange(n_samples)\n", - " unsampled_indices 
= indices_range[unsampled_mask]\n", - "\n", - " return unsampled_indices\n", - "\n", - "def _get_unsampled_indices(tree, n_samples: int):\n", - " \"\"\"\n", - " An interface to get unsampled indices regardless of sklearn version.\n", - " \"\"\"\n", - " import warnings\n", - " warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", - " if LooseVersion(sklearn.__version__) >= LooseVersion(\"0.22\"):\n", - " # Version 0.22 or newer uses 3 arguments.\n", - " from sklearn.ensemble.forest import _get_n_samples_bootstrap\n", - " n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " return _generate_unsampled_indices(tree.random_state, n_samples,\n", - " n_samples_bootstrap)\n", - " else:\n", - " # Version 0.21 or older uses only two arguments.\n", - " return _generate_unsampled_indices(tree.random_state, n_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following function estimates classifier accuracy and has been borrowed from **[references](#references)**. 
See **[breitman on oob](https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf)** for details on out-of-bag estimation:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def oob_classifier_accuracy(rf, X_train: np.array, y_train: np.array) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values\n", - " y = y_train.values\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_indices(tree, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Putting it all together:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def permutation_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array, \n", - " header: List[str],\n", - " metric: Callable = oob_classifier_accuracy,\n", - " figsz=(10, 5)\n", - ") -> np.array:\n", - " \"\"\"calculate change in metric from permuting feature columns\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " uses a pre-estimated model\n", - "\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truths, regression targets\n", - " :param header: column labels for X_train\n", - " 
:param figsz: matplotlib figure size\n", - " \n", - " \"\"\"\n", - " baseline = metric(model, X_train, y_train)\n", - " imp = []\n", - " for col in X_train.columns:\n", - " save = X_train[col].copy()\n", - " X_train[col] = np.random.permutation(X_train[col])\n", - " m = metric(model, X_train, y_train)\n", - " X_train[col] = save\n", - " imp.append(baseline - m)\n", - " \n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(\"feature permutation importances\")\n", - " plt.tight_layout()\n", - "\n", - " return np.array(feature_imp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances\n", - "\n", - "According to our **[references](#references)** a more accurate measure of feature importance would have us re-estimate the model after dropping a column. This is considered as being close to \"ideal\". 
Unfortunately, the entire model needs to be re-estimated for each column and without some approximating shortcut this is likely to be infeasible for large datasets.\n", - "\n", - "Here is the suggested implementation and **don't run this on big models!**:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def dropcol_importances(\n", - " model, \n", - " X_train: np.array,\n", - " y_train: np.array,\n", - " header: List[str] = [],\n", - " random_state: int = 1994,\n", - " figsz=(10, 5)\n", - ") -> pd.DataFrame:\n", - " \"\"\"drop columns and re-estimate model\n", - " \n", - " modified from https://explained.ai/rf-importance/index.html\n", - " \n", - " :param rf: model to fit\n", - " :param X_train: training set features\n", - " :param y_train: training set ground truth labels\n", - "\n", - " Returns:\n", - " pd.DataFrame: table of diffs vs baseline metric\n", - " \"\"\"\n", - " # cloning makes copy of model pre-fit\n", - " # calculate a baseline with all features\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_train, y_train)\n", - " baseline = model_.oob_score_\n", - " \n", - " # now drop each colum, refit model and calc metric\n", - " imp = []\n", - " for col in X_train.columns:\n", - " X = X_train.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X, y_train)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " \n", - " # put it all in a table\n", - " imp = np.array(imp)\n", - " feature_imps = pd.DataFrame(\n", - " data={'feature': X_train.columns,\n", - " 'importance': imp})\n", - " #feature_imps.set_index('feature', inplace=True)\n", - " feature_imps.sort_values('importance', ascending=True, inplace=True)\n", - " \n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imps)\n", - " plt.title(\"drop column feature 
importances\")\n", - " plt.tight_layout()\n", - " \n", - " return feature_imps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## demonstration\n", - "\n", - "In this demonstratuon we are going to take a fraction of a fraction of **[Kaggle's RentHop rental listing interest competition](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)**--the complete dataset is presently >80GB, we'll be looking at 5K rows. \n", - "\n", - "The competition's **[goal](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)** was\n", - "> to predict the number of inquiries a new listing receives based on the listing’s creation date and other features. \n", - "\n", - "Doing so would help **[RentHop](https://www.renthop.com/)**\n", - "> better handle fraud control, identify potential listing quality issues, and allow owners and agents to better understand renters’ needs and preferences." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "data = \"/User/artifacts/two-sigma-connect-rental-listing-inquiries/\"\n", - "NFRAC = 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample dimensions (4935, 6)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bathroomsbedroomspricelongitudelatitudeinterest_level
182351.001800-73.996040.71972
421401.023000-73.987940.76533
46771.011350-73.899640.85492
\n", - "
" - ], - "text/plain": [ - " bathrooms bedrooms price longitude latitude interest_level\n", - "18235 1.0 0 1800 -73.9960 40.7197 2\n", - "42140 1.0 2 3000 -73.9879 40.7653 3\n", - "4677 1.0 1 1350 -73.8996 40.8549 2" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(data + 'rent.csv').sample(frac=NFRAC)\n", - "print(\"sample dimensions\", df.shape)\n", - "df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']\n", - "dfr = df[features]\n", - "\n", - "# drop price column\n", - "X_train, y_train = dfr.drop('price', axis=1), dfr['price']\n", - "\n", - "# insert column with random values\n", - "X_train['random'] = np.random.random(size=len(X_train))\n", - "features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'random']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_jobs=-1, oob_score=True)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# define model\n", - "model_params = {\n", - " \"n_estimators\" : 100, \n", - " \"min_samples_leaf\" : 1,\n", - " \"n_jobs\" : -1,\n", - " \"oob_score\" : True\n", - "}\n", - "\n", - "model = SomeModel(**model_params)\n", - "\n", - "# estimate\n", - "model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### to run this the model needs a default attribute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "default feature_importances [0.01683784 0.03215169 0.29983429 0.30418813 0.34698806]\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "if hasattr(model, \"feature_importances_\"):\n", - " print(\"default feature_importances\", model.feature_importances_)\n", - " feature_importances(model, features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## permutation importances\n", - "\n", - "No need to check for default attributes or functions, this can be run on any kind of model:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.06545086119554205, 'longitude'],\n", - " [0.06160081053698076, 'latitude'],\n", - " [0.053495440729483285, 'bedrooms'],\n", - " [0.021681864235055734, 'bathrooms'],\n", - " [0.0004052684903748799, 'random']], dtype=object)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pi = permutation_importances(model, X_train, y_train, features)\n", - "pi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop-column importances" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
4random-0.042756
0bathrooms0.001824
1bedrooms0.023100
2longitude0.049240
3latitude0.051874
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "4 random -0.042756\n", - "0 bathrooms 0.001824\n", - "1 bedrooms 0.023100\n", - "2 longitude 0.049240\n", - "3 latitude 0.051874" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dc = dropcol_importances(model, X_train, y_train)\n", - "dc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## conclusions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So I would say location is a prime factor, then the number of bedrooms. Bathrooms often is gte bedrooms, and is likely correlated so one of them should likely be dropped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "toc-autonumbering": false, - "toc-showcode": false, - "toc-showmarkdowntxt": false - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/feature_perms/latest/src/feature_perms.ipynb b/functions/development/feature_perms/latest/src/feature_perms.ipynb deleted file mode 100644 index 77da7b55..00000000 --- a/functions/development/feature_perms/latest/src/feature_perms.ipynb +++ /dev/null @@ -1,1106 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# permutation_importances as reusable function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## function code" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import 
numbers\n", - "\n", - "import sklearn\n", - "from sklearn.base import clone\n", - "from sklearn.utils import check_random_state\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from cloudpickle import load\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem\n", - "from mlrun.artifacts import get_model, PlotArtifact\n", - "from typing import Union, Callable, List\n", - "\n", - "def _get_n_samples_bootstrap(n_samples, max_samples) -> int:\n", - " \"\"\"get the number of samples in a bootstrap sample\n", - " \n", - " returns the total number of samples to draw for the bootstrap sample\n", - " \n", - " private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py\n", - "\n", - " :param n_samples: Number of samples in the dataset.\n", - " :param max_samples: \n", - " The maximum number of samples to draw from the total available:\n", - " - if float, this indicates a fraction of the total and should be\n", - " the interval `(0, 1)`;\n", - " - if int, this indicates the exact number of samples;\n", - " - if None, this indicates the total number of samples.\n", - " \"\"\"\n", - " if max_samples is None:\n", - " return n_samples\n", - "\n", - " if isinstance(max_samples, numbers.Integral):\n", - " if not (1 <= max_samples <= n_samples):\n", - " msg = \"`max_samples` must be in range 1 to {} but got value {}\"\n", - " raise ValueError(msg.format(n_samples, max_samples))\n", - " return max_samples\n", - "\n", - " if isinstance(max_samples, numbers.Real):\n", - " if not (0 < max_samples < 1):\n", - " msg = \"`max_samples` must be in range (0, 1) but got value {}\"\n", - " raise ValueError(msg.format(max_samples))\n", - " return int(round(n_samples * max_samples))\n", - "\n", - " msg = \"`max_samples` should be int or float, but got type '{}'\"\n", - " raise TypeError(msg.format(type(max_samples)))\n", - "\n", - "def _get_unsampled_ix(random_state, n_samples: int) -> np.array:\n", 
- " \"\"\"\n", - " future-proof get unsampled indices\n", - " \"\"\"\n", - " n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)\n", - " random_instance = check_random_state(random_state)\n", - " sample_indices = random_instance.randint(0, n_samples, n_bootstrap)\n", - " sample_counts = np.bincount(sample_indices, minlength=n_samples)\n", - "\n", - " return np.arange(n_samples)[sample_counts==0]\n", - "\n", - "def _oob_classifier_accuracy(rf, X_train, y_train) -> float:\n", - " \"\"\"\n", - " Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.\n", - " \n", - " https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425\n", - " \"\"\"\n", - " X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train\n", - " y = y_train.values if isinstance(y_train, pd.Series) else y_train\n", - "\n", - " n_samples = len(X)\n", - " n_classes = len(np.unique(y))\n", - " predictions = np.zeros((n_samples, n_classes))\n", - " for tree in rf.estimators_:\n", - " unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)\n", - " tree_preds = tree.predict_proba(X[unsampled_indices, :])\n", - " predictions[unsampled_indices] += tree_preds\n", - "\n", - " predicted_class_indexes = np.argmax(predictions, axis=1)\n", - " predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]\n", - "\n", - " oob_score = np.mean(y == predicted_classes)\n", - " \n", - " return oob_score\n", - "\n", - "def permutation_importances(\n", - " context: MLClientCtx,\n", - " model: DataItem,\n", - " dataset: DataItem,\n", - " labels: str,\n", - " figsz=(10, 5),\n", - " plots_dest: str = \"plots\",\n", - " fitype: str = \"permute\"\n", - ") -> pd.DataFrame:\n", - " \"\"\"calculate change in metric\n", - " \n", - " type 'permute' uses a pre-estimated model\n", - " type 'dropcol' uses a re-estimates model\n", - " \n", - " :param context: the function's execution context\n", - " :param model: a trained model\n", - " :param dataset: 
features and ground truths, regression targets\n", - " :param labels name of the ground truths column\n", - " :param figsz: matplotlib figure size\n", - " :param plots_dest: path within artifact store\n", - " :\n", - " \"\"\"\n", - " model_file, model_data, _ = get_model(model.url, suffix='.pkl')\n", - " model = load(open(str(model_file), \"rb\"))\n", - " \n", - " X = dataset.as_df()\n", - " y = X.pop(labels)\n", - " header = X.columns\n", - " \n", - " # this will be paramettrized next version, and include regression\n", - " metric = _oob_classifier_accuracy\n", - " \n", - " baseline = metric(model, X, y)\n", - " \n", - " imp = []\n", - " for col in X.columns:\n", - " if fitype is \"permute\":\n", - " save = X[col].copy()\n", - " X[col] = np.random.permutation(X[col])\n", - " m = metric(model, X, y)\n", - " X[col] = save\n", - " imp.append(baseline - m)\n", - " elif fitype is \"dropcol\":\n", - " X_ = X.drop(col, axis=1)\n", - " model_ = clone(model)\n", - " model_.random_state = random_state\n", - " model_.fit(X_, y)\n", - " o = model_.oob_score_\n", - " imp.append(baseline - o)\n", - " else:\n", - " raise ValueError(\"unknown fitype, only 'permute' or 'dropcol' permitted\")\n", - "\n", - " # create a feature importance table with desired labels\n", - " zipped = zip(imp, header)\n", - " feature_imp = pd.DataFrame(sorted(zipped), columns=[\"importance\", \"feature\"])\n", - " feature_imp.sort_values(by=\"importance\", ascending=False, inplace=True)\n", - "\n", - " plt.clf()\n", - " plt.figure(figsize=figsz)\n", - " sns.barplot(x=\"importance\", y=\"feature\", data=feature_imp)\n", - " plt.title(f\"feature importances-{fitype}\")\n", - " plt.tight_layout()\n", - "\n", - " context.log_artifact(PlotArtifact(f\"feature importances-{fitype}\", body=plt.gcf()),\n", - " local_path=f\"{plots_dest}/feature-permutations.html\")\n", - " context.log_dataset(f\"feature-importances-{fitype}-tbl\", df=feature_imp, index=False)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## save function" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.platforms.other import auto_mount\n", - "\n", - "gpus = False\n", - "\n", - "# create job function object from notebook code\n", - "fn_params = {\n", - " \"name\" : \"feature-perms\",\n", - " \"handler\" : \"permutation_importances\",\n", - " \"kind\" : \"job\",\n", - " \"image\" : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n", - " \"description\" : \"estimate feature importances using permutations\",\n", - " \"categories\" : [\"analysis\"],\n", - " \"labels\" : {\"author\": \"yjb\"}\n", - "}\n", - "\n", - "perms_fn = code_to_function(**fn_params)\n", - "perms_fn.apply(auto_mount())\n", - "perms_fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "from mlrun import NewTask, mlconf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### get some data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:25,486 Job is running in 
the background, pod: tasks-arc-to-parq-xqkr7\n", - "[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet\n", - "[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp\n", - "[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading\n", - "[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default\n", - "[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed\n" - ] - } - ], - "source": [ - "data_url = \"https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv\"\n", - "\n", - "fn = import_function(\"hub://arc_to_parquet\", \"a2p\")\n", - "fn.apply(auto_mount())\n", - "\n", - "params = {\n", - " \"name\" : \"tasks arc-to-parq\",\n", - " \"params\" : {\"key\":\"rent\", \"stats\": True, \"file_ext\":\"csv\"}\n", - "}\n", - "acquire_run = fn.run(NewTask(**params),inputs={\"archive_url\" : data_url},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### train a model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263 -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5\n", - "[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model\n", - "[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y\n", - "[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N\n", - "[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N\n", - "[mlrun] 2020-06-07 19:58:37,806 log artifact 
precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N\n", - "[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N\n", - "[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default\n", - "[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed\n" - ] - } - ], - "source": [ - "fn = import_function(\"hub://sklearn_classifier\", \"skrf\")\n", - "fn.apply(auto_mount())\n", - "\n", - "# define model\n", - "params = {\n", - " \"name\" : \"tasks random forest\",\n", - " \"params\" : {\n", - " \"sample\" : -5_000, # 5k random rows,\n", - " \"model_pkg_class\" : \"sklearn.ensemble.RandomForestClassifier\",\n", - " \"label_column\" : \"interest_level\",\n", - " \"CLASS_n_estimators\" : 100,\n", - " \"CLASS_min_samples_leaf\" : 1,\n", - " \"CLASS_n_jobs\" : -1,\n", - " \"CLASS_oob_score\" : True}\n", - "}\n", - "\n", - "train_run = fn.run(NewTask(**params), inputs={\"dataset\" : acquire_run.outputs[\"rent\"]},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Importances

\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=train_run.outputs['feature-importances'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "data = acquire_run.outputs[\"rent\"]\n", - "labels = \"interest_level\"\n", - "model = train_run.outputs[\"model\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt\n", - "[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances\n", - "[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y\n", - "[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y\n", - "\n", - "[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default\n", - "[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed\n" - ] - } - ], - "source": [ - "fi_perms = perms_fn.run(\n", - " NewTask(params={\"labels\": labels, \n", - " \"plots_dest\": \"plots\"}),\n", - " inputs={\"model\": model, \"dataset\": data},\n", - " artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename=fi_perms.outputs['feature importances-permute'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/feature_perms/latest/src/feature_perms.py b/functions/development/feature_perms/latest/src/feature_perms.py deleted file mode 100644 index 13caae32..00000000 --- a/functions/development/feature_perms/latest/src/feature_perms.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you 
may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import numpy as np -import pandas as pd -import numbers - -import sklearn -from sklearn.base import clone -from sklearn.utils import check_random_state - -import matplotlib.pyplot as plt -import seaborn as sns - -from cloudpickle import load - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem -from mlrun.artifacts import get_model, PlotArtifact -from typing import Union, Callable, List - - -def _get_n_samples_bootstrap(n_samples, max_samples) -> int: - """get the number of samples in a bootstrap sample - - returns the total number of samples to draw for the bootstrap sample - - private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py - - :param n_samples: Number of samples in the dataset. - :param max_samples: - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. 
- """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, numbers.Integral): - if not (1 <= max_samples <= n_samples): - msg = "`max_samples` must be in range 1 to {} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) - return max_samples - - if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" - raise ValueError(msg.format(max_samples)) - return int(round(n_samples * max_samples)) - - msg = "`max_samples` should be int or float, but got type '{}'" - raise TypeError(msg.format(type(max_samples))) - - -def _get_unsampled_ix(random_state, n_samples: int) -> np.array: - """ - future-proof get unsampled indices - """ - n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples) - random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_bootstrap) - sample_counts = np.bincount(sample_indices, minlength=n_samples) - - return np.arange(n_samples)[sample_counts == 0] - - -def _oob_classifier_accuracy(rf, X_train, y_train) -> float: - """ - Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier. 
- - https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425 - """ - X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train - y = y_train.values if isinstance(y_train, pd.Series) else y_train - - n_samples = len(X) - n_classes = len(np.unique(y)) - predictions = np.zeros((n_samples, n_classes)) - for tree in rf.estimators_: - unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples) - tree_preds = tree.predict_proba(X[unsampled_indices, :]) - predictions[unsampled_indices] += tree_preds - - predicted_class_indexes = np.argmax(predictions, axis=1) - predicted_classes = [rf.classes_[i] for i in predicted_class_indexes] - - oob_score = np.mean(y == predicted_classes) - - return oob_score - - -def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline - m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - 
imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - ) diff --git a/functions/development/feature_perms/latest/src/function.yaml b/functions/development/feature_perms/latest/src/function.yaml deleted file mode 100644 index 713981fd..00000000 --- a/functions/development/feature_perms/latest/src/function.yaml +++ /dev/null @@ -1,63 +0,0 @@ -kind: job -metadata: - name: feature-perms - tag: '' - hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7 - project: '' - labels: - author: yjb - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: permutation_importance - entry_points: - permutation_importance: - name: permutation_importance - doc: 'calculate change in metric - - - type ''permute'' uses a pre-estimated model - - type ''dropcol'' uses a re-estimates model' - parameters: - - name: context - type: MLClientCtx - doc: the function's execution context - default: '' - - name: model - type: DataItem - doc: a trained model - default: '' - - name: dataset - type: DataItem - doc: features and ground truths, regression targets - default: '' - - name: labels - type: str - default: '' - - name: figsz - doc: matplotlib figure size - default: - - 10 - - 5 - - name: plots_dest - type: str - doc: path within artifact store - default: plots - - name: fitype - type: str - default: 
permute - outputs: - - default: '' - lineno: 93 - description: estimate feature importances using permutations - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdC
huX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcm
VkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG
0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py - affinity: null -verbose: false diff --git a/functions/development/feature_perms/latest/src/item.yaml b/functions/development/feature_perms/latest/src/item.yaml deleted file mode 100644 index bd909d3e..00000000 --- a/functions/development/feature_perms/latest/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: estimate feature importances using permutations -doc: '' -example: feature_perms.ipynb -generationDate: 2022-08-28:17-25 -hidden: false 
-icon: '' -labels: - author: yjb -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: feature-perms -platformVersion: 3.5.0 -spec: - filename: feature_perms.py - handler: permutation_importance - image: mlrun/ml-models - kind: job - requirements: [] -url: '' -version: 1.1.0 -test_valid : False diff --git a/functions/development/feature_perms/latest/src/requirements.txt b/functions/development/feature_perms/latest/src/requirements.txt deleted file mode 100644 index 70a079c7..00000000 --- a/functions/development/feature_perms/latest/src/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scikit-learn -matplotlib -seaborn -scikit-plot - diff --git a/functions/development/feature_perms/latest/src/test_feature_perms.py b/functions/development/feature_perms/latest/src/test_feature_perms.py deleted file mode 100644 index a59891ea..00000000 --- a/functions/development/feature_perms/latest/src/test_feature_perms.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from mlrun import code_to_function, import_function -from pathlib import Path -import os - -ARTIFACTS_PATH = 'artifacts' -DATA_URL = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv" -FEATURE_OUTPUT = "feature-importances-permute-tbl" - - -def arc_to_parquet(): - from mlrun import import_function - - archive_func = import_function('hub://arc_to_parquet') - - archive_run = archive_func.run( - handler="arc_to_parquet", - params={"key": "rent", "stats": True, "file_ext": "csv"}, - inputs={"archive_url": DATA_URL}, - artifact_path=os.getcwd() + '/artifacts', - local=True, - ) - - return archive_run.artifact('rent').url - - -def sklearn_classifier(run): - cwd = os.getcwd() - file_path = str(Path(cwd).parent.absolute()) + "/sklearn_classifier/sklearn_classifier.py" - fn = code_to_function( - name='test_sklearn_classifier', - filename=file_path, - handler="train_model", - kind="local", - ) - - fn.spec.command = file_path - fn.run( - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True, - }, - handler="train_model", - inputs={"dataset": run.outputs["rent"]}, - artifact_path='artifacts', - ) - - -def train_model(data): - from mlrun import import_function - - train = import_function('hub://sklearn_classifier') - - train_run = train.run( - inputs={"dataset": data}, - params={ - "sample": -5_000, # 5k random rows, - "model_pkg_class": "sklearn.ensemble.RandomForestClassifier", - "label_column": "interest_level", - "CLASS_n_estimators": 100, - "CLASS_min_samples_leaf": 1, - "CLASS_n_jobs": -1, - "CLASS_oob_score": True, - }, - local=True - ) - - return train_run.artifact('model').url - - -def test_feature_selection_run_local(): - data = arc_to_parquet() - model = train_model(data) - labels = "interest_level" - fn = 
code_to_function( - name='test_run_local_feature_perms', - filename="feature_perms.py", - handler="permutation_importance", - kind="local", - ) - fn.spec.command = "feature_perms.py" - - run = fn.run( - params={ - "labels": labels, - "plots_dest": "plots", - }, - inputs={ - "model": model, - "dataset": data, - }, - artifact_path='artifacts', - ) - - assert run.artifact(FEATURE_OUTPUT).get() - - -def test_feature_perms_import_function(): - data = arc_to_parquet() - model = train_model(data) - labels = "interest_level" - fn = import_function("function.yaml") - - run = fn.run( - params={ - "labels": labels, - "plots_dest": "plots" - }, - inputs={ - "model": model, - "dataset": data}, - artifact_path=os.getcwd() + '/artifacts', - local=True, - ) - - assert run.artifact(FEATURE_OUTPUT).get() diff --git a/functions/development/feature_perms/latest/static/documentation.html b/functions/development/feature_perms/latest/static/documentation.html deleted file mode 100644 index 66452f5a..00000000 --- a/functions/development/feature_perms/latest/static/documentation.html +++ /dev/null @@ -1,242 +0,0 @@ - - - - - - - -feature_perms package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

feature_perms package

- -
- -
-
-
-
-
-

feature_perms package#

-
-

Submodules#

-
-
-

feature_perms.feature_perms module#

-
-
-feature_perms.feature_perms.permutation_importance(context: mlrun.execution.MLClientCtx, model: mlrun.datastore.base.DataItem, dataset: mlrun.datastore.base.DataItem, labels: str, figsz=(10, 5), plots_dest: str = 'plots', fitype: str = 'permute')pandas.core.frame.DataFrame[source]#
-

calculate change in metric

-

type ‘permute’ uses a pre-estimated model -type ‘dropcol’ uses a re-estimates model

-
-
Parameters
-
    -
  • context – the function’s execution context

  • -
  • model – a trained model

  • -
  • dataset – features and ground truths, regression targets

  • -
-
-
-

:param labels name of the ground truths column -:param figsz: matplotlib figure size -:param plots_dest: path within artifact store -:

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/feature_perms/latest/static/example.html b/functions/development/feature_perms/latest/static/example.html deleted file mode 100644 index a5869008..00000000 --- a/functions/development/feature_perms/latest/static/example.html +++ /dev/null @@ -1,1184 +0,0 @@ - - - - - - - -permutation_importances as reusable function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

permutation_importances as reusable function

- -
- -
-
-
-
-
-

permutation_importances as reusable function#

-
-

function code#

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-    
-    returns the total number of samples to draw for the bootstrap sample
-    
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples: 
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts==0]
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-    
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-    
-    return oob_score
-
-def permutation_importances(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute"
-) -> pd.DataFrame:
-    """calculate change in metric
-    
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-    
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix='.pkl')
-    model = load(open(str(model_file), "rb"))
-    
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-    
-    # this will be paramettrized next version, and include regression
-    metric = _oob_classifier_accuracy
-    
-    baseline = metric(model, X, y)
-    
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    # create a feature importance table with desired labels
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-                         local_path=f"{plots_dest}/feature-permutations.html")
-    context.log_dataset(f"feature-importances-{fitype}-tbl", df=feature_imp, index=False)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

save function#

-
-
-
from mlrun import code_to_function
-from mlrun.platforms.other import auto_mount
-
-gpus = False
-
-# create job function object from notebook code
-fn_params = {
-    "name"        : "feature-perms",
-    "handler"     : "permutation_importances",
-    "kind"        : "job",
-    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
-    "description" : "estimate feature importances using permutations",
-    "categories"  : ["analysis"],
-    "labels"      : {"author": "yjb"}
-}
-
-perms_fn = code_to_function(**fn_params)
-perms_fn.apply(auto_mount())
-perms_fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,298 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7feb0104efd0>
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun import import_function
-from mlrun import NewTask, mlconf
-
-
-
-
-
-

get some data#

-
-
-
data_url = "https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv"
-
-fn = import_function("hub://arc_to_parquet", "a2p")
-fn.apply(auto_mount())
-
-params = {
-    "name" : "tasks arc-to-parq",
-    "params" : {"key":"rent", "stats": True, "file_ext":"csv"}
-}
-acquire_run = fn.run(NewTask(**params),inputs={"archive_url" : data_url},
-                     artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:25,352 starting run tasks arc-to-parq uid=e9bc67f2189c418d96bfde754d369956  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:25,486 Job is running in the background, pod: tasks-arc-to-parq-xqkr7
-[mlrun] 2020-06-07 19:58:29,118 starting local run: main.py # arc_to_parquet
-[mlrun] 2020-06-07 19:58:29,169 downloading https://raw.githubusercontent.com/parrt/random-forest-importances/master/notebooks/data/rent.csv to local tmp
-[mlrun] 2020-06-07 19:58:29,535 destination file does not exist, downloading
-[mlrun] 2020-06-07 19:58:29,898 log artifact rent at /User/artifacts/rent.csv, size: 1492462, db: Y
-
-[mlrun] 2020-06-07 19:58:29,917 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:29completedtasks arc-to-parq
v3io_user=admin
kind=job
owner=admin
host=tasks-arc-to-parq-xqkr7
archive_url
key=rent
stats=True
file_ext=csv
rent
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run e9bc67f2189c418d96bfde754d369956 --project default , !mlrun logs e9bc67f2189c418d96bfde754d369956 --project default
-[mlrun] 2020-06-07 19:58:31,666 run executed, status=completed
-
-
-
-
-
-
-

train a model#

-
-
-
fn = import_function("hub://sklearn_classifier", "skrf")
-fn.apply(auto_mount())
-
-# define model
-params = {
-    "name" : "tasks random forest",
-    "params" : {
-        "sample"                 : -5_000, # 5k random rows,
-        "model_pkg_class"        : "sklearn.ensemble.RandomForestClassifier",
-        "label_column"           : "interest_level",
-        "CLASS_n_estimators"     : 100,
-        "CLASS_min_samples_leaf" : 1,
-        "CLASS_n_jobs"           : -1,
-        "CLASS_oob_score"        : True}
-}
-
-train_run = fn.run(NewTask(**params), inputs={"dataset" : acquire_run.outputs["rent"]},
-                   artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:31,704 starting run tasks random forest uid=57af834167264641905a5bb5e6b0e263  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:31,861 Job is running in the background, pod: tasks-random-forest-vkjk5
-[mlrun] 2020-06-07 19:58:35,390 starting local run: main.py # train_model
-[mlrun] 2020-06-07 19:58:36,310 log artifact test_set at /User/artifacts/data/test_set.parquet, size: 24484, db: Y
-[mlrun] 2020-06-07 19:58:37,153 log artifact confusion-matrix at /User/artifacts/model/plots/confusion-matrix.html, size: 27401, db: N
-[mlrun] 2020-06-07 19:58:37,598 log artifact feature-importances at /User/artifacts/model/plots/feature-importances.html, size: 19685, db: N
-[mlrun] 2020-06-07 19:58:37,806 log artifact precision-recall-multiclass at /User/artifacts/model/plots/precision-recall-multiclass.html, size: 74009, db: N
-[mlrun] 2020-06-07 19:58:37,936 log artifact roc-multiclass at /User/artifacts/model/plots/roc-multiclass.html, size: 73053, db: N
-[mlrun] 2020-06-07 19:58:38,079 log artifact model at /User/artifacts/model/, size: 10346780, db: Y
-
-[mlrun] 2020-06-07 19:58:38,106 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:36completedtasks random forest
v3io_user=admin
kind=job
owner=admin
host=tasks-random-forest-vkjk5
class=sklearn.ensemble.RandomForestClassifier
dataset
sample=-5000
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=interest_level
CLASS_n_estimators=100
CLASS_min_samples_leaf=1
CLASS_n_jobs=-1
CLASS_oob_score=True
test-accuracy=0.6902857142857143
test-error=0.3097142857142857
auc-micro=0.8567196734693878
auc-weighted=0.7077200281488216
f1-score=0.44361444815007395
precision_score=0.4969837043184901
recall_score=0.42733978329897576
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 57af834167264641905a5bb5e6b0e263 --project default , !mlrun logs 57af834167264641905a5bb5e6b0e263 --project default
-[mlrun] 2020-06-07 19:58:41,115 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=train_run.outputs['feature-importances'])
-
-
-
-
-

Feature Importances

-
-
-
-
-
data   = acquire_run.outputs["rent"]
-labels = "interest_level"
-model  = train_run.outputs["model"]
-
-
-
-
-
-
-
fi_perms = perms_fn.run(
-    NewTask(params={"labels": labels, 
-                    "plots_dest": "plots"}),
-    inputs={"model": model, "dataset": data},
-    artifact_path=mlconf.artifact_path)
-
-
-
-
-
[mlrun] 2020-06-07 19:58:41,152 starting run features-permutation_importances uid=89235b15ac2a4213aefc906c178a1c5e  -> http://mlrun-api:8080
-[mlrun] 2020-06-07 19:58:41,312 Job is running in the background, pod: features-permutation-importances-dwxmt
-[mlrun] 2020-06-07 19:58:44,871 starting local run: main.py # permutation_importances
-[mlrun] 2020-06-07 19:58:48,714 log artifact feature importances-permute at /User/artifacts/plots/feature-permutations.html, size: 25694, db: Y
-[mlrun] 2020-06-07 19:58:48,770 log artifact feature-importances-permute-tbl at /User/artifacts/feature-importances-permute-tbl.csv, size: 167, db: Y
-
-[mlrun] 2020-06-07 19:58:48,785 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 07 19:58:45completedfeatures-permutation_importances
v3io_user=admin
kind=job
owner=admin
host=features-permutation-importances-dwxmt
model
dataset
labels=interest_level
plots_dest=plots
feature importances-permute
feature-importances-permute-tbl
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 89235b15ac2a4213aefc906c178a1c5e --project default , !mlrun logs 89235b15ac2a4213aefc906c178a1c5e --project default
-[mlrun] 2020-06-07 19:58:50,488 run executed, status=completed
-
-
-
-
-
-
-
from IPython.display import HTML
-HTML(filename=fi_perms.outputs['feature importances-permute'])
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/feature_perms/latest/static/feature_perms.html b/functions/development/feature_perms/latest/static/feature_perms.html deleted file mode 100644 index 5121c7f7..00000000 --- a/functions/development/feature_perms/latest/static/feature_perms.html +++ /dev/null @@ -1,314 +0,0 @@ - - - - - - - -feature_perms.feature_perms - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for feature_perms.feature_perms

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-
[docs]def permutation_importance( - context: MLClientCtx, - model: DataItem, - dataset: DataItem, - labels: str, - figsz=(10, 5), - plots_dest: str = "plots", - fitype: str = "permute", -) -> pd.DataFrame: - """calculate change in metric - - type 'permute' uses a pre-estimated model - type 'dropcol' uses a re-estimates model - - :param context: the function's execution context - :param model: a trained model - :param dataset: features and ground truths, regression targets - :param labels name of the ground truths column - :param figsz: matplotlib figure size - :param plots_dest: path within artifact store - : - """ - model_file, model_data, _ = get_model(model.url, suffix=".pkl") - model = load(open(str(model_file), "rb")) - - X = dataset.as_df() - y = X.pop(labels) - header = X.columns - - metric = _oob_classifier_accuracy - - baseline = metric(model, X, y) - - imp = [] - for col in X.columns: - if fitype is "permute": - save = X[col].copy() - X[col] = np.random.permutation(X[col]) - m = metric(model, X, y) - X[col] = save - imp.append(baseline - m) - elif fitype is "dropcol": - X_ = X.drop(col, axis=1) - model_ = clone(model) - #model_.random_state = random_state - model_.fit(X_, y) - o = model_.oob_score_ - imp.append(baseline - o) - else: - raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted") - - zipped = zip(imp, header) - feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"]) - feature_imp.sort_values(by="importance", ascending=False, inplace=True) - - plt.clf() - plt.figure(figsize=figsz) - sns.barplot(x="importance", y="feature", data=feature_imp) - plt.title(f"feature importances-{fitype}") - plt.tight_layout() - - context.log_artifact( - PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()), - local_path=f"{plots_dest}/feature-permutations.html", - ) - context.log_dataset( - f"feature-importances-{fitype}-tbl", df=feature_imp, index=False - )
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/feature_perms/latest/static/function.html b/functions/development/feature_perms/latest/static/function.html deleted file mode 100644 index f9520633..00000000 --- a/functions/development/feature_perms/latest/static/function.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: feature-perms
-  tag: ''
-  hash: 2e32234a73e2e48f029cf6c957b150ec2ffd4bc7
-  project: ''
-  labels:
-    author: yjb
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: permutation_importance
-  entry_points:
-    permutation_importance:
-      name: permutation_importance
-      doc: 'calculate change in metric
-
-
-        type ''permute'' uses a pre-estimated model
-
-        type ''dropcol'' uses a re-estimates model'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function's execution context
-        default: ''
-      - name: model
-        type: DataItem
-        doc: a trained model
-        default: ''
-      - name: dataset
-        type: DataItem
-        doc: features and ground truths, regression targets
-        default: ''
-      - name: labels
-        type: str
-        default: ''
-      - name: figsz
-        doc: matplotlib figure size
-        default:
-        - 10
-        - 5
-      - name: plots_dest
-        type: str
-        doc: path within artifact store
-        default: plots
-      - name: fitype
-        type: str
-        default: permute
-      outputs:
-      - default: ''
-      lineno: 93
-  description: estimate feature importances using permutations
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bWJlcnMKCmltcG9ydCBza2xlYXJuCmZyb20gc2tsZWFybi5iYXNlIGltcG9ydCBjbG9uZQpmcm9tIHNrbGVhcm4udXRpbHMgaW1wb3J0IGNoZWNrX3JhbmRvbV9zdGF0ZQoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAppbXBvcnQgc2VhYm9ybiBhcyBzbnMKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgUGxvdEFydGlmYWN0CmZyb20gdHlwaW5nIGltcG9ydCBVbmlvbiwgQ2FsbGFibGUsIExpc3QKCgpkZWYgX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbWF4X3NhbXBsZXMpIC0+IGludDoKICAgICIiImdldCB0aGUgbnVtYmVyIG9mIHNhbXBsZXMgaW4gYSBib290c3RyYXAgc2FtcGxlCgogICAgcmV0dXJucyB0aGUgdG90YWwgbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmb3IgdGhlIGJvb3RzdHJhcCBzYW1wbGUKCiAgICBwcml2YXRlIGFwaSBpbiBza2xlYXJuID49IHYwLjI0LCB0YWtlbiBmcm9tIHNrbGVhcm4uZW5zZW1ibGUuX2ZvcmVzdC5weQoKICAgIDpwYXJhbSBuX3NhbXBsZXM6ICAgTnVtYmVyIG9mIHNhbXBsZXMgaW4gdGhlIGRhdGFzZXQuCiAgICA6cGFyYW0gbWF4X3NhbXBsZXM6CiAgICAgICAgVGhlIG1heGltdW0gbnVtYmVyIG9mIHNhbXBsZXMgdG8gZHJhdyBmcm9tIHRoZSB0b3RhbCBhdmFpbGFibGU6CiAgICAgICAgICAgIC0gaWYgZmxvYXQsIHRoaXMgaW5kaWNhdGVzIGEgZnJhY3Rpb24gb2YgdGhlIHRvdGFsIGFuZCBzaG91bGQgYmUKICAgICAgICAgICAgICB0aGUgaW50ZXJ2YWwgYCgwLCAxKWA7CiAgICAgICAgICAgIC0gaWYgaW50LCB0aGlzIGluZGljYXRlcyB0aGUgZXhhY3QgbnVtYmVyIG9mIHNhbXBsZXM7CiAgICAgICAgICAgIC0gaWYgTm9uZSwgdGhpcyBpbmRpY2F0ZXMgdGhlIHRvdGFsIG51bWJlciBvZiBzYW1wbGVzLgogICAgIiIiCiAgICBpZiBtYXhfc2FtcGxlcyBpcyBOb25lOgogICAgICAgIHJldHVybiBuX3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1iZXJzLkludGVncmFsKToKICAgICAgICBpZiBub3QgKDEgPD0gbWF4X3NhbXBsZXMgPD0gbl9zYW1wbGVzKToKICAgICAgICAgICAgbXNnID0gImBtYXhfc2FtcGxlc2AgbXVzdCBiZSBpbiByYW5nZSAxIHRvIHt9IGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChuX3NhbXBsZXMsIG1heF9zYW1wbGVzKSkKICAgICAgICByZXR1cm4gbWF4X3NhbXBsZXMKCiAgICBpZiBpc2luc3RhbmNlKG1heF9zYW1wbGVzLCBudW1
iZXJzLlJlYWwpOgogICAgICAgIGlmIG5vdCAoMCA8IG1heF9zYW1wbGVzIDwgMSk6CiAgICAgICAgICAgIG1zZyA9ICJgbWF4X3NhbXBsZXNgIG11c3QgYmUgaW4gcmFuZ2UgKDAsIDEpIGJ1dCBnb3QgdmFsdWUge30iCiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IobXNnLmZvcm1hdChtYXhfc2FtcGxlcykpCiAgICAgICAgcmV0dXJuIGludChyb3VuZChuX3NhbXBsZXMgKiBtYXhfc2FtcGxlcykpCgogICAgbXNnID0gImBtYXhfc2FtcGxlc2Agc2hvdWxkIGJlIGludCBvciBmbG9hdCwgYnV0IGdvdCB0eXBlICd7fSciCiAgICByYWlzZSBUeXBlRXJyb3IobXNnLmZvcm1hdCh0eXBlKG1heF9zYW1wbGVzKSkpCgoKZGVmIF9nZXRfdW5zYW1wbGVkX2l4KHJhbmRvbV9zdGF0ZSwgbl9zYW1wbGVzOiBpbnQpIC0+IG5wLmFycmF5OgogICAgIiIiCiAgICBmdXR1cmUtcHJvb2YgZ2V0IHVuc2FtcGxlZCBpbmRpY2VzCiAgICAiIiIKICAgIG5fYm9vdHN0cmFwID0gX2dldF9uX3NhbXBsZXNfYm9vdHN0cmFwKG5fc2FtcGxlcywgbl9zYW1wbGVzKQogICAgcmFuZG9tX2luc3RhbmNlID0gY2hlY2tfcmFuZG9tX3N0YXRlKHJhbmRvbV9zdGF0ZSkKICAgIHNhbXBsZV9pbmRpY2VzID0gcmFuZG9tX2luc3RhbmNlLnJhbmRpbnQoMCwgbl9zYW1wbGVzLCBuX2Jvb3RzdHJhcCkKICAgIHNhbXBsZV9jb3VudHMgPSBucC5iaW5jb3VudChzYW1wbGVfaW5kaWNlcywgbWlubGVuZ3RoPW5fc2FtcGxlcykKCiAgICByZXR1cm4gbnAuYXJhbmdlKG5fc2FtcGxlcylbc2FtcGxlX2NvdW50cyA9PSAwXQoKCmRlZiBfb29iX2NsYXNzaWZpZXJfYWNjdXJhY3kocmYsIFhfdHJhaW4sIHlfdHJhaW4pIC0+IGZsb2F0OgogICAgIiIiCiAgICBDb21wdXRlIG91dC1vZi1iYWcgKE9PQikgYWNjdXJhY3kgZm9yIGEgc2Npa2l0LWxlYXJuIGZvcmVzdCBjbGFzc2lmaWVyLgoKICAgIGh0dHBzOi8vZ2l0aHViLmNvbS9zY2lraXQtbGVhcm4vc2Npa2l0LWxlYXJuL2Jsb2IvYTI0YzhiNDYvc2tsZWFybi9lbnNlbWJsZS9mb3Jlc3QucHkjTDQyNQogICAgIiIiCiAgICBYID0gWF90cmFpbi52YWx1ZXMgaWYgaXNpbnN0YW5jZShYX3RyYWluLCBwZC5EYXRhRnJhbWUpIGVsc2UgWF90cmFpbgogICAgeSA9IHlfdHJhaW4udmFsdWVzIGlmIGlzaW5zdGFuY2UoeV90cmFpbiwgcGQuU2VyaWVzKSBlbHNlIHlfdHJhaW4KCiAgICBuX3NhbXBsZXMgPSBsZW4oWCkKICAgIG5fY2xhc3NlcyA9IGxlbihucC51bmlxdWUoeSkpCiAgICBwcmVkaWN0aW9ucyA9IG5wLnplcm9zKChuX3NhbXBsZXMsIG5fY2xhc3NlcykpCiAgICBmb3IgdHJlZSBpbiByZi5lc3RpbWF0b3JzXzoKICAgICAgICB1bnNhbXBsZWRfaW5kaWNlcyA9IF9nZXRfdW5zYW1wbGVkX2l4KHRyZWUucmFuZG9tX3N0YXRlLCBuX3NhbXBsZXMpCiAgICAgICAgdHJlZV9wcmVkcyA9IHRyZWUucHJlZGljdF9wcm9iYShYW3Vuc2FtcGxlZF9pbmRpY2VzLCA6XSkKICAgICAgICBwcmVkaWN0aW9uc1t1bnNhbXBsZWRfaW5kaWNlc10
gKz0gdHJlZV9wcmVkcwoKICAgIHByZWRpY3RlZF9jbGFzc19pbmRleGVzID0gbnAuYXJnbWF4KHByZWRpY3Rpb25zLCBheGlzPTEpCiAgICBwcmVkaWN0ZWRfY2xhc3NlcyA9IFtyZi5jbGFzc2VzX1tpXSBmb3IgaSBpbiBwcmVkaWN0ZWRfY2xhc3NfaW5kZXhlc10KCiAgICBvb2Jfc2NvcmUgPSBucC5tZWFuKHkgPT0gcHJlZGljdGVkX2NsYXNzZXMpCgogICAgcmV0dXJuIG9vYl9zY29yZQoKCmRlZiBwZXJtdXRhdGlvbl9pbXBvcnRhbmNlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbDogRGF0YUl0ZW0sCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsczogc3RyLAogICAgZmlnc3o9KDEwLCA1KSwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaXR5cGU6IHN0ciA9ICJwZXJtdXRlIiwKKSAtPiBwZC5EYXRhRnJhbWU6CiAgICAiIiJjYWxjdWxhdGUgY2hhbmdlIGluIG1ldHJpYwoKICAgIHR5cGUgJ3Blcm11dGUnIHVzZXMgYSBwcmUtZXN0aW1hdGVkIG1vZGVsCiAgICB0eXBlICdkcm9wY29sJyB1c2VzIGEgcmUtZXN0aW1hdGVzIG1vZGVsCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICB0aGUgZnVuY3Rpb24ncyBleGVjdXRpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICBhIHRyYWluZWQgbW9kZWwKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgZmVhdHVyZXMgYW5kIGdyb3VuZCB0cnV0aHMsIHJlZ3Jlc3Npb24gdGFyZ2V0cwogICAgOnBhcmFtIGxhYmVscyAgICAgICBuYW1lIG9mIHRoZSBncm91bmQgdHJ1dGhzIGNvbHVtbgogICAgOnBhcmFtIGZpZ3N6OiAgICAgICBtYXRwbG90bGliIGZpZ3VyZSBzaXplCiAgICA6cGFyYW0gcGxvdHNfZGVzdDogIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlCiAgICA6CiAgICAiIiIKICAgIG1vZGVsX2ZpbGUsIG1vZGVsX2RhdGEsIF8gPSBnZXRfbW9kZWwobW9kZWwudXJsLCBzdWZmaXg9Ii5wa2wiKQogICAgbW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBYID0gZGF0YXNldC5hc19kZigpCiAgICB5ID0gWC5wb3AobGFiZWxzKQogICAgaGVhZGVyID0gWC5jb2x1bW5zCgogICAgbWV0cmljID0gX29vYl9jbGFzc2lmaWVyX2FjY3VyYWN5CgogICAgYmFzZWxpbmUgPSBtZXRyaWMobW9kZWwsIFgsIHkpCgogICAgaW1wID0gW10KICAgIGZvciBjb2wgaW4gWC5jb2x1bW5zOgogICAgICAgIGlmIGZpdHlwZSBpcyAicGVybXV0ZSI6CiAgICAgICAgICAgIHNhdmUgPSBYW2NvbF0uY29weSgpCiAgICAgICAgICAgIFhbY29sXSA9IG5wLnJhbmRvbS5wZXJtdXRhdGlvbihYW2NvbF0pCiAgICAgICAgICAgIG0gPSBtZXRyaWMobW9kZWwsIFgsIHkpCiAgICAgICAgICAgIFhbY29sXSA9IHNhdmUKICAgICAgICAgICAgaW1wLmFwcGVuZChiYXNlbGluZSAtIG0pCiAgICAgICAgZWxpZiBmaXR5cGUgaXMgImRyb3Bjb2wiOgogICAgICAgICAgICBYXyA9IFguZHJvcChjb2wsIGF4aXM9MSkKICAgICAgICAgICAgbW9
kZWxfID0gY2xvbmUobW9kZWwpCiAgICAgICAgICAgICNtb2RlbF8ucmFuZG9tX3N0YXRlID0gcmFuZG9tX3N0YXRlCiAgICAgICAgICAgIG1vZGVsXy5maXQoWF8sIHkpCiAgICAgICAgICAgIG8gPSBtb2RlbF8ub29iX3Njb3JlXwogICAgICAgICAgICBpbXAuYXBwZW5kKGJhc2VsaW5lIC0gbykKICAgICAgICBlbHNlOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKCJ1bmtub3duIGZpdHlwZSwgb25seSAncGVybXV0ZScgb3IgJ2Ryb3Bjb2wnIHBlcm1pdHRlZCIpCgogICAgemlwcGVkID0gemlwKGltcCwgaGVhZGVyKQogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WyJpbXBvcnRhbmNlIiwgImZlYXR1cmUiXSkKICAgIGZlYXR1cmVfaW1wLnNvcnRfdmFsdWVzKGJ5PSJpbXBvcnRhbmNlIiwgYXNjZW5kaW5nPUZhbHNlLCBpbnBsYWNlPVRydWUpCgogICAgcGx0LmNsZigpCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9Zmlnc3opCiAgICBzbnMuYmFycGxvdCh4PSJpbXBvcnRhbmNlIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoZiJmZWF0dXJlIGltcG9ydGFuY2VzLXtmaXR5cGV9IikKICAgIHBsdC50aWdodF9sYXlvdXQoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KAogICAgICAgIFBsb3RBcnRpZmFjdChmImZlYXR1cmUgaW1wb3J0YW5jZXMte2ZpdHlwZX0iLCBib2R5PXBsdC5nY2YoKSksCiAgICAgICAgbG9jYWxfcGF0aD1mIntwbG90c19kZXN0fS9mZWF0dXJlLXBlcm11dGF0aW9ucy5odG1sIiwKICAgICkKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoCiAgICAgICAgZiJmZWF0dXJlLWltcG9ydGFuY2VzLXtmaXR5cGV9LXRibCIsIGRmPWZlYXR1cmVfaW1wLCBpbmRleD1GYWxzZQogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/feature_perms/feature_perms.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/latest/static/item.html b/functions/development/feature_perms/latest/static/item.html deleted file mode 100644 index 2cdb2735..00000000 --- a/functions/development/feature_perms/latest/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: estimate feature importances using permutations
-doc: ''
-example: feature_perms.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yjb
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: feature-perms
-platformVersion: 3.5.0
-spec:
-  filename: feature_perms.py
-  handler: permutation_importance
-  image: mlrun/ml-models
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-test_valid : False
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/feature_perms/latest/static/source.html b/functions/development/feature_perms/latest/static/source.html deleted file mode 100644 index 9d9c9b8c..00000000 --- a/functions/development/feature_perms/latest/static/source.html +++ /dev/null @@ -1,196 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import numpy as np
-import pandas as pd
-import numbers
-
-import sklearn
-from sklearn.base import clone
-from sklearn.utils import check_random_state
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-from cloudpickle import load
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-from mlrun.artifacts import get_model, PlotArtifact
-from typing import Union, Callable, List
-
-
-def _get_n_samples_bootstrap(n_samples, max_samples) -> int:
-    """get the number of samples in a bootstrap sample
-
-    returns the total number of samples to draw for the bootstrap sample
-
-    private api in sklearn >= v0.24, taken from sklearn.ensemble._forest.py
-
-    :param n_samples:   Number of samples in the dataset.
-    :param max_samples:
-        The maximum number of samples to draw from the total available:
-            - if float, this indicates a fraction of the total and should be
-              the interval `(0, 1)`;
-            - if int, this indicates the exact number of samples;
-            - if None, this indicates the total number of samples.
-    """
-    if max_samples is None:
-        return n_samples
-
-    if isinstance(max_samples, numbers.Integral):
-        if not (1 <= max_samples <= n_samples):
-            msg = "`max_samples` must be in range 1 to {} but got value {}"
-            raise ValueError(msg.format(n_samples, max_samples))
-        return max_samples
-
-    if isinstance(max_samples, numbers.Real):
-        if not (0 < max_samples < 1):
-            msg = "`max_samples` must be in range (0, 1) but got value {}"
-            raise ValueError(msg.format(max_samples))
-        return int(round(n_samples * max_samples))
-
-    msg = "`max_samples` should be int or float, but got type '{}'"
-    raise TypeError(msg.format(type(max_samples)))
-
-
-def _get_unsampled_ix(random_state, n_samples: int) -> np.array:
-    """
-    future-proof get unsampled indices
-    """
-    n_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
-    random_instance = check_random_state(random_state)
-    sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
-    sample_counts = np.bincount(sample_indices, minlength=n_samples)
-
-    return np.arange(n_samples)[sample_counts == 0]
-
-
-def _oob_classifier_accuracy(rf, X_train, y_train) -> float:
-    """
-    Compute out-of-bag (OOB) accuracy for a scikit-learn forest classifier.
-
-    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
-    """
-    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
-    y = y_train.values if isinstance(y_train, pd.Series) else y_train
-
-    n_samples = len(X)
-    n_classes = len(np.unique(y))
-    predictions = np.zeros((n_samples, n_classes))
-    for tree in rf.estimators_:
-        unsampled_indices = _get_unsampled_ix(tree.random_state, n_samples)
-        tree_preds = tree.predict_proba(X[unsampled_indices, :])
-        predictions[unsampled_indices] += tree_preds
-
-    predicted_class_indexes = np.argmax(predictions, axis=1)
-    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]
-
-    oob_score = np.mean(y == predicted_classes)
-
-    return oob_score
-
-
-def permutation_importance(
-    context: MLClientCtx,
-    model: DataItem,
-    dataset: DataItem,
-    labels: str,
-    figsz=(10, 5),
-    plots_dest: str = "plots",
-    fitype: str = "permute",
-) -> pd.DataFrame:
-    """calculate change in metric
-
-    type 'permute' uses a pre-estimated model
-    type 'dropcol' uses a re-estimates model
-
-    :param context:     the function's execution context
-    :param model:       a trained model
-    :param dataset:     features and ground truths, regression targets
-    :param labels       name of the ground truths column
-    :param figsz:       matplotlib figure size
-    :param plots_dest:  path within artifact store
-    :
-    """
-    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
-    model = load(open(str(model_file), "rb"))
-
-    X = dataset.as_df()
-    y = X.pop(labels)
-    header = X.columns
-
-    metric = _oob_classifier_accuracy
-
-    baseline = metric(model, X, y)
-
-    imp = []
-    for col in X.columns:
-        if fitype is "permute":
-            save = X[col].copy()
-            X[col] = np.random.permutation(X[col])
-            m = metric(model, X, y)
-            X[col] = save
-            imp.append(baseline - m)
-        elif fitype is "dropcol":
-            X_ = X.drop(col, axis=1)
-            model_ = clone(model)
-            #model_.random_state = random_state
-            model_.fit(X_, y)
-            o = model_.oob_score_
-            imp.append(baseline - o)
-        else:
-            raise ValueError("unknown fitype, only 'permute' or 'dropcol' permitted")
-
-    zipped = zip(imp, header)
-    feature_imp = pd.DataFrame(sorted(zipped), columns=["importance", "feature"])
-    feature_imp.sort_values(by="importance", ascending=False, inplace=True)
-
-    plt.clf()
-    plt.figure(figsize=figsz)
-    sns.barplot(x="importance", y="feature", data=feature_imp)
-    plt.title(f"feature importances-{fitype}")
-    plt.tight_layout()
-
-    context.log_artifact(
-        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
-        local_path=f"{plots_dest}/feature-permutations.html",
-    )
-    context.log_dataset(
-        f"feature-importances-{fitype}-tbl", df=feature_imp, index=False
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.0.1/src/function.yaml b/functions/development/get_offline_features/0.0.1/src/function.yaml deleted file mode 100644 index 3a298a70..00000000 --- a/functions/development/get_offline_features/0.0.1/src/function.yaml +++ /dev/null @@ -1,107 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: da837de231cf416f2f866c3fe2be9f4d27c35b7d - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: get_offline_features - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment).' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context. - default: '' - - name: feature_vector - type: str - doc: feature vector uri. - default: '' - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with. 
- default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to. - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column - must be passed when using time filtering. - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column - must be passed when using time filtering. - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 9 - description: retrieve offline feature vector results - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIGdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlX3ZlY3Rvcjogc3RyLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQKCiAgICBUaGUgc3RhcnRfdGltZSBhbmQgZW5kX3RpbWUgYXR0cmlidXRlcyBhbGxvdyBmaWx0ZXJpbmcgdGhlIGRhdGEgdG8gYSBnaXZlbiB0aW1lIHJhbmdlLCB0aGV5IGFjY2VwdAogICAgc3RyaW5nIHZhbHVlcyBvciBwYW5kYXMgYFRpbWVzdGFtcGAgb2JqZWN0cywgc3RyaW5nIHZhbHVlcyBjYW4gYWxzbyBiZSByZWxhdGl2ZSwgZm9yIGV4YW1wbGU6CiAgICAibm93IiwgIm5vdyAtIDFkMmgiLCAibm93KzVtIiwgd2hlcmUgYSB2YWxpZCBwYW5kYXMgVGltZWRlbHRhIHN0cmluZyBmb2xsb3dzIHRoZSB2ZXJiICJub3ciLAogICAgZm9yIHRpbWUgYWxpZ25tZW50IHlvdSBjYW4gdXNlIHRoZSB2ZXJiICJmbG9vciIgZS5nLiAibm93IC0xZCBmbG9vciAxSCIgd2lsbCBhbGlnbiB0aGUgdGltZSB0byB0aGUgbGFzdCBob3VyCiAgICAodGhlIGZsb29yIHN0cmluZyBpcyBwYXNzZWQgdG8gcGFuZGFzLlRpbWVzdGFtcC5mbG9vcigpLCBjYW4gdXNlIEQsIEgsIFQsIFMgZm
9yIGRheSwgaG91ciwgbWluLCBzZWMgYWxpZ25tZW50KS4KCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmFtIGZlYXR1cmVfdmVjdG9yOiBmZWF0dXJlIHZlY3RvciB1cmkuCiAgICA6cGFyYW0gZW50aXR5X3Jvd3M6ICAgIFVSSSBvZiB0aGUgZGF0YSBlbnRpdHkgcm93cyB0byBqb2luIHdpdGguCiAgICA6cGFyYW0gdGFyZ2V0OiAgICAgICAgIHdoZXJlIHRvIHdyaXRlIHRoZSByZXN1bHRzIHRvLgogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsLgogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcuCiAgICA6cGFyYW0gZW5kX3RpbWU6ICAgICAgICBkYXRldGltZSwgaGlnaCBsaW1pdCBvZiB0aW1lIG5lZWRlZCB0byBiZSBmaWx0ZXJlZC4gT3B0aW9uYWwuCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZy4KICAgIDpwYXJhbSB3aXRoX2luZGV4ZXM6ICAgIHJldHVybiB2ZWN0b3Igd2l0aCBpbmRleCBjb2x1bW5zIChkZWZhdWx0IEZhbHNlKQogICAgOnBhcmFtIHVwZGF0ZV9zdGF0czogICAgdXBkYXRlIGZlYXR1cmVzIHN0YXRpc3RpY3MgZnJvbSB0aGUgcmVxdWVzdGVkIGZlYXR1cmUgc2V0cyBvbiB0aGUgdmVjdG9yLiBEZWZhdWx0IGlzIEZhbHNlLgoKICAgIDpyZXR1cm5zIGZlYXR1cmVfdmVjdG9yIGlucHV0CiAgICAiIiIKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCi
AgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg== - commands: [] - code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py - origin_filename: C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py - disable_auto_mount: false - priority_class_name: '' - affinity: null -verbose: false diff --git a/functions/development/get_offline_features/0.0.1/src/get_offline_features.ipynb b/functions/development/get_offline_features/0.0.1/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/0.0.1/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " 
\"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/0.0.1/src/get_offline_features.py b/functions/development/get_offline_features/0.0.1/src/get_offline_features.py deleted file mode 100644 index 56fe0583..00000000 --- a/functions/development/get_offline_features/0.0.1/src/get_offline_features.py +++ /dev/null @@ -1,120 +0,0 @@ -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: List[str] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features is not None: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) diff --git a/functions/development/get_offline_features/0.0.1/src/item.yaml 
b/functions/development/get_offline_features/0.0.1/src/item.yaml deleted file mode 100644 index 3fec3455..00000000 --- a/functions/development/get_offline_features/0.0.1/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-01-17:17-56 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.9.1 -name: get_offline_features -platformVersion: '' -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 0.0.1 diff --git a/functions/development/get_offline_features/0.0.1/src/test_get_offline_features.py b/functions/development/get_offline_features/0.0.1/src/test_get_offline_features.py deleted file mode 100644 index 1aa92f0b..00000000 --- a/functions/development/get_offline_features/0.0.1/src/test_get_offline_features.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. 
- """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. - """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": 
[ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. - """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the 
dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/0.0.1/static/documentation.html b/functions/development/get_offline_features/0.0.1/static/documentation.html deleted file mode 100644 index 3f3bfebc..00000000 --- a/functions/development/get_offline_features/0.0.1/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

get_offline_features package

-
-

Submodules

-
-
-

get_offline_features.get_offline_features module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.0.1/static/example.html b/functions/development/get_offline_features/0.0.1/static/example.html deleted file mode 100644 index a2811133..00000000 --- a/functions/development/get_offline_features/0.0.1/static/example.html +++ /dev/null @@ -1,1256 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

get_offline_features() from MLRun FeatureStore

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example

-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/get_offline_features_example_15_0.svg
-
-
-
-

Ingest Data Into Offline And Online Stores

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.0.1/static/function.html b/functions/development/get_offline_features/0.0.1/static/function.html deleted file mode 100644 index 4d3e8de2..00000000 --- a/functions/development/get_offline_features/0.0.1/static/function.html +++ /dev/null @@ -1,129 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: da837de231cf416f2f866c3fe2be9f4d27c35b7d
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: get_offline_features
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment).'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context.
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri.
-        default: ''
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with.
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to.
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column
-          must be passed when using time filtering.
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column
-          must be passed when using time filtering.
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 9
-  description: retrieve offline feature vector results
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIGdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlX3ZlY3Rvcjogc3RyLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQKCiAgICBUaGUgc3RhcnRfdGltZSBhbmQgZW5kX3RpbWUgYXR0cmlidXRlcyBhbGxvdyBmaWx0ZXJpbmcgdGhlIGRhdGEgdG8gYSBnaXZlbiB0aW1lIHJhbmdlLCB0aGV5IGFjY2VwdAogICAgc3RyaW5nIHZhbHVlcyBvciBwYW5kYXMgYFRpbWVzdGFtcGAgb2JqZWN0cywgc3RyaW5nIHZhbHVlcyBjYW4gYWxzbyBiZSByZWxhdGl2ZSwgZm9yIGV4YW1wbGU6CiAgICAibm93IiwgIm5vdyAtIDFkMmgiLCAibm93KzVtIiwgd2hlcmUgYSB2YWxpZCBwYW5kYXMgVGltZWRlbHRhIHN0cmluZyBmb2xsb3dzIHRoZSB2ZXJiICJub3ciLAogICAgZm9yIHRpbWUgYWxpZ25tZW50IHlvdSBjYW4gdXNlIHRoZSB2ZXJiICJmbG9vciIgZS5nLiAibm93IC0xZCBmbG9vciAxSCIgd2lsbCBhbGlnbiB0aGUgdGltZSB0byB0aGUgbGFzdCBob3VyCiAgICAodGhlIGZsb29yIHN0cmluZyBpcyBwYXNzZWQgdG8gcGFuZGFzLlRpbWVzdGFtcC5mbG9vcigpLCBjYW4gdXNlIEQsIEgsIFQsIFMgZm9yIGRheSwgaG91ciwgbWluLCBzZWMgYWxpZ25tZW50KS4KCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmF
tIGZlYXR1cmVfdmVjdG9yOiBmZWF0dXJlIHZlY3RvciB1cmkuCiAgICA6cGFyYW0gZW50aXR5X3Jvd3M6ICAgIFVSSSBvZiB0aGUgZGF0YSBlbnRpdHkgcm93cyB0byBqb2luIHdpdGguCiAgICA6cGFyYW0gdGFyZ2V0OiAgICAgICAgIHdoZXJlIHRvIHdyaXRlIHRoZSByZXN1bHRzIHRvLgogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsLgogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcuCiAgICA6cGFyYW0gZW5kX3RpbWU6ICAgICAgICBkYXRldGltZSwgaGlnaCBsaW1pdCBvZiB0aW1lIG5lZWRlZCB0byBiZSBmaWx0ZXJlZC4gT3B0aW9uYWwuCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZy4KICAgIDpwYXJhbSB3aXRoX2luZGV4ZXM6ICAgIHJldHVybiB2ZWN0b3Igd2l0aCBpbmRleCBjb2x1bW5zIChkZWZhdWx0IEZhbHNlKQogICAgOnBhcmFtIHVwZGF0ZV9zdGF0czogICAgdXBkYXRlIGZlYXR1cmVzIHN0YXRpc3RpY3MgZnJvbSB0aGUgcmVxdWVzdGVkIGZlYXR1cmUgc2V0cyBvbiB0aGUgdmVjdG9yLiBEZWZhdWx0IGlzIEZhbHNlLgoKICAgIDpyZXR1cm5zIGZlYXR1cmVfdmVjdG9yIGlucHV0CiAgICAiIiIKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJ
pdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg==
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py
-    origin_filename: C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py
-  disable_auto_mount: false
-  priority_class_name: ''
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.0.1/static/item.html b/functions/development/get_offline_features/0.0.1/static/item.html deleted file mode 100644 index 121d026b..00000000 --- a/functions/development/get_offline_features/0.0.1/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-01-17:17-56
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.9.1
-name: get_offline_features
-platformVersion: ''
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.0.1/static/source.html b/functions/development/get_offline_features/0.0.1/static/source.html deleted file mode 100644 index 2b75bea7..00000000 --- a/functions/development/get_offline_features/0.0.1/static/source.html +++ /dev/null @@ -1,142 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: List[str] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features is not None:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.0/src/function.yaml b/functions/development/get_offline_features/0.9.0/src/function.yaml deleted file mode 100644 index 3a298a70..00000000 --- a/functions/development/get_offline_features/0.9.0/src/function.yaml +++ /dev/null @@ -1,107 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: da837de231cf416f2f866c3fe2be9f4d27c35b7d - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: get_offline_features - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment).' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context. - default: '' - - name: feature_vector - type: str - doc: feature vector uri. - default: '' - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with. 
- default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to. - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column - must be passed when using time filtering. - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column - must be passed when using time filtering. - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 9 - description: retrieve offline feature vector results - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIGdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlX3ZlY3Rvcjogc3RyLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQKCiAgICBUaGUgc3RhcnRfdGltZSBhbmQgZW5kX3RpbWUgYXR0cmlidXRlcyBhbGxvdyBmaWx0ZXJpbmcgdGhlIGRhdGEgdG8gYSBnaXZlbiB0aW1lIHJhbmdlLCB0aGV5IGFjY2VwdAogICAgc3RyaW5nIHZhbHVlcyBvciBwYW5kYXMgYFRpbWVzdGFtcGAgb2JqZWN0cywgc3RyaW5nIHZhbHVlcyBjYW4gYWxzbyBiZSByZWxhdGl2ZSwgZm9yIGV4YW1wbGU6CiAgICAibm93IiwgIm5vdyAtIDFkMmgiLCAibm93KzVtIiwgd2hlcmUgYSB2YWxpZCBwYW5kYXMgVGltZWRlbHRhIHN0cmluZyBmb2xsb3dzIHRoZSB2ZXJiICJub3ciLAogICAgZm9yIHRpbWUgYWxpZ25tZW50IHlvdSBjYW4gdXNlIHRoZSB2ZXJiICJmbG9vciIgZS5nLiAibm93IC0xZCBmbG9vciAxSCIgd2lsbCBhbGlnbiB0aGUgdGltZSB0byB0aGUgbGFzdCBob3VyCiAgICAodGhlIGZsb29yIHN0cmluZyBpcyBwYXNzZWQgdG8gcGFuZGFzLlRpbWVzdGFtcC5mbG9vcigpLCBjYW4gdXNlIEQsIEgsIFQsIFMgZm
9yIGRheSwgaG91ciwgbWluLCBzZWMgYWxpZ25tZW50KS4KCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmFtIGZlYXR1cmVfdmVjdG9yOiBmZWF0dXJlIHZlY3RvciB1cmkuCiAgICA6cGFyYW0gZW50aXR5X3Jvd3M6ICAgIFVSSSBvZiB0aGUgZGF0YSBlbnRpdHkgcm93cyB0byBqb2luIHdpdGguCiAgICA6cGFyYW0gdGFyZ2V0OiAgICAgICAgIHdoZXJlIHRvIHdyaXRlIHRoZSByZXN1bHRzIHRvLgogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsLgogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcuCiAgICA6cGFyYW0gZW5kX3RpbWU6ICAgICAgICBkYXRldGltZSwgaGlnaCBsaW1pdCBvZiB0aW1lIG5lZWRlZCB0byBiZSBmaWx0ZXJlZC4gT3B0aW9uYWwuCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZy4KICAgIDpwYXJhbSB3aXRoX2luZGV4ZXM6ICAgIHJldHVybiB2ZWN0b3Igd2l0aCBpbmRleCBjb2x1bW5zIChkZWZhdWx0IEZhbHNlKQogICAgOnBhcmFtIHVwZGF0ZV9zdGF0czogICAgdXBkYXRlIGZlYXR1cmVzIHN0YXRpc3RpY3MgZnJvbSB0aGUgcmVxdWVzdGVkIGZlYXR1cmUgc2V0cyBvbiB0aGUgdmVjdG9yLiBEZWZhdWx0IGlzIEZhbHNlLgoKICAgIDpyZXR1cm5zIGZlYXR1cmVfdmVjdG9yIGlucHV0CiAgICAiIiIKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCi
AgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg== - commands: [] - code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py - origin_filename: C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py - disable_auto_mount: false - priority_class_name: '' - affinity: null -verbose: false diff --git a/functions/development/get_offline_features/0.9.0/src/get_offline_features.ipynb b/functions/development/get_offline_features/0.9.0/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/0.9.0/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " 
\"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/0.9.0/src/get_offline_features.py b/functions/development/get_offline_features/0.9.0/src/get_offline_features.py deleted file mode 100644 index 56fe0583..00000000 --- a/functions/development/get_offline_features/0.9.0/src/get_offline_features.py +++ /dev/null @@ -1,120 +0,0 @@ -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: List[str] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features is not None: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) diff --git a/functions/development/get_offline_features/0.9.0/src/item.yaml 
b/functions/development/get_offline_features/0.9.0/src/item.yaml deleted file mode 100644 index 509c7d60..00000000 --- a/functions/development/get_offline_features/0.9.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-01-17:17-56 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.9.1 -name: get_offline_features -platformVersion: '' -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/get_offline_features/0.9.0/src/test_get_offline_features.py b/functions/development/get_offline_features/0.9.0/src/test_get_offline_features.py deleted file mode 100644 index 1aa92f0b..00000000 --- a/functions/development/get_offline_features/0.9.0/src/test_get_offline_features.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. 
- """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. - """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": 
[ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. - """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the 
dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/0.9.0/static/documentation.html b/functions/development/get_offline_features/0.9.0/static/documentation.html deleted file mode 100644 index 3f3bfebc..00000000 --- a/functions/development/get_offline_features/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

get_offline_features package

-
-

Submodules

-
-
-

get_offline_features.get_offline_features module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.0/static/example.html b/functions/development/get_offline_features/0.9.0/static/example.html deleted file mode 100644 index a2811133..00000000 --- a/functions/development/get_offline_features/0.9.0/static/example.html +++ /dev/null @@ -1,1256 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

get_offline_features() from MLRun FeatureStore

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example

-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/get_offline_features_example_15_0.svg
-
-
-
-

Ingest Data Into Offline And Online Stores

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.0/static/function.html b/functions/development/get_offline_features/0.9.0/static/function.html deleted file mode 100644 index 4d3e8de2..00000000 --- a/functions/development/get_offline_features/0.9.0/static/function.html +++ /dev/null @@ -1,129 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: da837de231cf416f2f866c3fe2be9f4d27c35b7d
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: get_offline_features
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment).'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context.
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri.
-        default: ''
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with.
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to.
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column
-          must be passed when using time filtering.
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column
-          must be passed when using time filtering.
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 9
-  description: retrieve offline feature vector results
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIGdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlX3ZlY3Rvcjogc3RyLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQKCiAgICBUaGUgc3RhcnRfdGltZSBhbmQgZW5kX3RpbWUgYXR0cmlidXRlcyBhbGxvdyBmaWx0ZXJpbmcgdGhlIGRhdGEgdG8gYSBnaXZlbiB0aW1lIHJhbmdlLCB0aGV5IGFjY2VwdAogICAgc3RyaW5nIHZhbHVlcyBvciBwYW5kYXMgYFRpbWVzdGFtcGAgb2JqZWN0cywgc3RyaW5nIHZhbHVlcyBjYW4gYWxzbyBiZSByZWxhdGl2ZSwgZm9yIGV4YW1wbGU6CiAgICAibm93IiwgIm5vdyAtIDFkMmgiLCAibm93KzVtIiwgd2hlcmUgYSB2YWxpZCBwYW5kYXMgVGltZWRlbHRhIHN0cmluZyBmb2xsb3dzIHRoZSB2ZXJiICJub3ciLAogICAgZm9yIHRpbWUgYWxpZ25tZW50IHlvdSBjYW4gdXNlIHRoZSB2ZXJiICJmbG9vciIgZS5nLiAibm93IC0xZCBmbG9vciAxSCIgd2lsbCBhbGlnbiB0aGUgdGltZSB0byB0aGUgbGFzdCBob3VyCiAgICAodGhlIGZsb29yIHN0cmluZyBpcyBwYXNzZWQgdG8gcGFuZGFzLlRpbWVzdGFtcC5mbG9vcigpLCBjYW4gdXNlIEQsIEgsIFQsIFMgZm9yIGRheSwgaG91ciwgbWluLCBzZWMgYWxpZ25tZW50KS4KCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmF
tIGZlYXR1cmVfdmVjdG9yOiBmZWF0dXJlIHZlY3RvciB1cmkuCiAgICA6cGFyYW0gZW50aXR5X3Jvd3M6ICAgIFVSSSBvZiB0aGUgZGF0YSBlbnRpdHkgcm93cyB0byBqb2luIHdpdGguCiAgICA6cGFyYW0gdGFyZ2V0OiAgICAgICAgIHdoZXJlIHRvIHdyaXRlIHRoZSByZXN1bHRzIHRvLgogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsLgogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcuCiAgICA6cGFyYW0gZW5kX3RpbWU6ICAgICAgICBkYXRldGltZSwgaGlnaCBsaW1pdCBvZiB0aW1lIG5lZWRlZCB0byBiZSBmaWx0ZXJlZC4gT3B0aW9uYWwuCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZy4KICAgIDpwYXJhbSB3aXRoX2luZGV4ZXM6ICAgIHJldHVybiB2ZWN0b3Igd2l0aCBpbmRleCBjb2x1bW5zIChkZWZhdWx0IEZhbHNlKQogICAgOnBhcmFtIHVwZGF0ZV9zdGF0czogICAgdXBkYXRlIGZlYXR1cmVzIHN0YXRpc3RpY3MgZnJvbSB0aGUgcmVxdWVzdGVkIGZlYXR1cmUgc2V0cyBvbiB0aGUgdmVjdG9yLiBEZWZhdWx0IGlzIEZhbHNlLgoKICAgIDpyZXR1cm5zIGZlYXR1cmVfdmVjdG9yIGlucHV0CiAgICAiIiIKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJ
pdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg==
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py
-    origin_filename: C:\Users\yonatans\projects\functions\get_offline_features\get_offline_features.py
-  disable_auto_mount: false
-  priority_class_name: ''
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.0/static/item.html b/functions/development/get_offline_features/0.9.0/static/item.html deleted file mode 100644 index 9ec78602..00000000 --- a/functions/development/get_offline_features/0.9.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-01-17:17-56
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.9.1
-name: get_offline_features
-platformVersion: ''
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.0/static/source.html b/functions/development/get_offline_features/0.9.0/static/source.html deleted file mode 100644 index 2b75bea7..00000000 --- a/functions/development/get_offline_features/0.9.0/static/source.html +++ /dev/null @@ -1,142 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: List[str] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features is not None:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.1/src/function.yaml b/functions/development/get_offline_features/0.9.1/src/function.yaml deleted file mode 100644 index c3c929ad..00000000 --- a/functions/development/get_offline_features/0.9.1/src/function.yaml +++ /dev/null @@ -1,126 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: e7bd4fc1f7377374a910d4aa21e31dbba31f59a3 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBMaXN0W3N0cl0gPSBOb25lLAogICAgbGFiZWxfZmVhdHVyZTogc3RyID0gTm9uZSwKICAgIGRlc2NyaXB0aW9uOiBzdHIgPSBOb25lLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZC
BmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQuCiAgICBJZiBmZWF0dXJlIHZlY3RvciBkb2VzIG5vdCBleGlzdCwgYSBuZXcgb25lIHdpbGwgYmUgY3JlYXRlZCBhbmQgc2F2ZWQgd2l0aCB0aGUgZ2l2ZW4gZmVhdHVyZXMuCgogICAgVGhlIHN0YXJ0X3RpbWUgYW5kIGVuZF90aW1lIGF0dHJpYnV0ZXMgYWxsb3cgZmlsdGVyaW5nIHRoZSBkYXRhIHRvIGEgZ2l2ZW4gdGltZSByYW5nZSwgdGhleSBhY2NlcHQKICAgIHN0cmluZyB2YWx1ZXMgb3IgcGFuZGFzIGBUaW1lc3RhbXBgIG9iamVjdHMsIHN0cmluZyB2YWx1ZXMgY2FuIGFsc28gYmUgcmVsYXRpdmUsIGZvciBleGFtcGxlOgogICAgIm5vdyIsICJub3cgLSAxZDJoIiwgIm5vdys1bSIsIHdoZXJlIGEgdmFsaWQgcGFuZGFzIFRpbWVkZWx0YSBzdHJpbmcgZm9sbG93cyB0aGUgdmVyYiAibm93IiwKICAgIGZvciB0aW1lIGFsaWdubWVudCB5b3UgY2FuIHVzZSB0aGUgdmVyYiAiZmxvb3IiIGUuZy4gIm5vdyAtMWQgZmxvb3IgMUgiIHdpbGwgYWxpZ24gdGhlIHRpbWUgdG8gdGhlIGxhc3QgaG91cgogICAgKHRoZSBmbG9vciBzdHJpbmcgaXMgcGFzc2VkIHRvIHBhbmRhcy5UaW1lc3RhbXAuZmxvb3IoKSwgY2FuIHVzZSBELCBILCBULCBTIGZvciBkYXksIGhvdXIsIG1pbiwgc2VjIGFsaWdubWVudCkKCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZV92ZWN0b3I6IGZlYXR1cmUgdmVjdG9yIHVyaQogICAgOnBhcmFtIGZlYXR1cmVzOiAgICAgICBSZWxldmFudCBvbmx5IGlmIGZlYXR1cmVfdmVjdG9yIG5vdCBleGlzdC4gbGlzdCBvZiBmZWF0dXJlIHRvIGNvbGxlY3QgdG8gdGhpcyB2ZWN0b3IKICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9ybWF0IFs8cHJvamVjdD4vXTxmZWF0dXJlX3NldD4uPGZlYXR1cmVfbmFtZSBvciAqPiBbYXMgPGFsaWFzPl0KICAgIDpwYXJhbSBsYWJlbF9mZWF0dXJlOiAgZmVhdHVyZSBuYW1lIHRvIGJlIHVzZWQgYXMgbGFiZWwgZGF0YQogICAgOnBhcmFtIGRlc2NyaXB0aW9uOiAgICB0ZXh0IGRlc2NyaXB0aW9uIG9mIHRoZSB2ZWN0b3IKICAgIDpwYXJhbSBlbnRpdHlfcm93czogICAgVVJJIG9mIHRoZSBkYXRhIGVudGl0eSByb3dzIHRvIGpvaW4gd2l0aAogICAgOnBhcmFtIHRhcmdldDogICAgICAgICB3aGVyZSB0byB3cml0ZSB0aGUgcmVzdWx0cyB0bwogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZn
JhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIGVuZF90aW1lOiAgICAgICAgZGF0ZXRpbWUsIGhpZ2ggbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIHdpdGhfaW5kZXhlczogICAgcmV0dXJuIHZlY3RvciB3aXRoIGluZGV4IGNvbHVtbnMgKGRlZmF1bHQgRmFsc2UpCiAgICA6cGFyYW0gdXBkYXRlX3N0YXRzOiAgICB1cGRhdGUgZmVhdHVyZXMgc3RhdGlzdGljcyBmcm9tIHRoZSByZXF1ZXN0ZWQgZmVhdHVyZSBzZXRzIG9uIHRoZSB2ZWN0b3IuIERlZmF1bHQgaXMgRmFsc2UuCgogICAgOnJldHVybnMgZmVhdHVyZV92ZWN0b3IgaW5wdXQKICAgICIiIgoKICAgIGlmIGZlYXR1cmVzIGlzIG5vdCBOb25lOgogICAgICAgICMgQ3JlYXRpbmcgYSBuZXcgRmVhdHVyZVZlY3RvciBhbmQgc2F2aW5nOgogICAgICAgIGlmIGlzX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3Rvcik6CiAgICAgICAgICAgIHByZWZpeCwgbmV3X3VyaSA9IHBhcnNlX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3RvcikKICAgICAgICAgICAgaWYgcHJlZml4ICE9IFN0b3JlUHJlZml4LkZlYXR1cmVWZWN0b3I6CiAgICAgICAgICAgICAgICByYWlzZSBNTFJ1bkludmFsaWRBcmd1bWVudEVycm9yKAogICAgICAgICAgICAgICAgICAgIGYicHJvdmlkZWQgc3RvcmUgdXJpICh7ZmVhdHVyZV92ZWN0b3J9KSBkb2VzIG5vdCByZXByZXNlbnQgYSBmZWF0dXJlIHZlY3RvciAocHJlZml4PXtwcmVmaXh9KSIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSBuZXdfdXJpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iKQogICAgICAgIHByb2plY3QsIG5hbWUsIHRhZywgXyA9IHBhcnNlX3ZlcnNpb25lZF9vYmplY3RfdXJpKGZlYXR1cmVfdmVjdG9yLCBtbHJ1bi5tbGNvbmYuZGVmYXVsdF9wcm9qZWN0KQogICAgICAgIHZlY3RvciA9IGZzLkZlYXR1cmVWZWN0b3IobmFtZSwgZmVhdHVyZXMsIGxhYmVsX2ZlYXR1cmU9bGFiZWxfZmVhdHVyZSwgZGVzY3JpcHRpb249ZGVzY3JpcHRpb24pCiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnByb2plY3QgPSBwcm9qZWN0CiAgICAgICAgdmVjdG9yLm1ldG
FkYXRhLnRhZyA9IHRhZwogICAgICAgIHZlY3Rvci5zYXZlKCkKICAgICAgICBmZWF0dXJlX3ZlY3RvciA9IHZlY3Rvci51cmkKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg== - commands: [] - code_origin: 
https://github.com/mlrun/functions.git#552572c0a503a86a12830c7ab8eb515b2f1526fa:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target. - - If feature vector does not exist, a new one will be created and saved with - the given features. - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment)' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: feature_vector - type: str - doc: feature vector uri - default: '' - - name: features - type: List[str] - doc: Relevant only if feature_vector not exist. list of feature to collect - to this vector format [/]. 
[as - ] - default: null - - name: label_feature - type: str - doc: feature name to be used as label data - default: null - - name: description - type: str - doc: text description of the vector - default: null - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with - default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 13 - description: retrieve offline feature vector results - default_handler: get_offline_features - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null -verbose: false diff --git a/functions/development/get_offline_features/0.9.1/src/get_offline_features.ipynb b/functions/development/get_offline_features/0.9.1/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/0.9.1/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " 
\"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/0.9.1/src/get_offline_features.py b/functions/development/get_offline_features/0.9.1/src/get_offline_features.py deleted file mode 100644 index 59cd114d..00000000 --- a/functions/development/get_offline_features/0.9.1/src/get_offline_features.py +++ /dev/null @@ -1,121 +0,0 @@ -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: List[str] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features is not None: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) diff --git 
a/functions/development/get_offline_features/0.9.1/src/item.yaml b/functions/development/get_offline_features/0.9.1/src/item.yaml deleted file mode 100644 index e54b01ce..00000000 --- a/functions/development/get_offline_features/0.9.1/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-05-25:10-58 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.0.1 -name: get_offline_features -platformVersion: '' -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 0.9.1 diff --git a/functions/development/get_offline_features/0.9.1/src/test_get_offline_features.py b/functions/development/get_offline_features/0.9.1/src/test_get_offline_features.py deleted file mode 100644 index 1aa92f0b..00000000 --- a/functions/development/get_offline_features/0.9.1/src/test_get_offline_features.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. 
- """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. - """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": 
[ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. - """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the 
dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/0.9.1/static/documentation.html b/functions/development/get_offline_features/0.9.1/static/documentation.html deleted file mode 100644 index 7e82b21a..00000000 --- a/functions/development/get_offline_features/0.9.1/static/documentation.html +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

get_offline_features package

-
-

Submodules

-
-
-

get_offline_features.get_offline_features module

-
-
-get_offline_features.get_offline_features.get_offline_features(context: mlrun.execution.MLClientCtx, feature_vector: str, features: Optional[List[str]] = None, label_feature: Optional[str] = None, description: Optional[str] = None, entity_rows: Optional[mlrun.datastore.base.DataItem] = None, entity_timestamp_column: Optional[str] = None, target: Optional[Union[str, Dict]] = None, run_config: Optional[Union[str, Dict]] = None, drop_columns: Optional[List[str]] = None, start_time: Optional[str] = None, end_time: Optional[str] = None, with_indexes: bool = False, update_stats: bool = False)[source]
-

retrieve offline feature vector results

-

specify a feature vector object/uri and retrieve the desired features, their metadata -and statistics. returns OfflineVectorResponse, -results can be returned as a dataframe or written to a target. -If feature vector does not exist, a new one will be created and saved with the given features.

-

The start_time and end_time attributes allow filtering the data to a given time range, they accept -string values or pandas Timestamp objects, string values can also be relative, for example: -“now”, “now - 1d2h”, “now+5m”, where a valid pandas Timedelta string follows the verb “now”, -for time alignment you can use the verb “floor” e.g. “now -1d floor 1H” will align the time to the last hour -(the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • feature_vector – feature vector uri

  • -
  • features – Relevant only if feature_vector not exist. list of feature to collect to this vector -format [<project>/]<feature_set>.<feature_name or *> [as <alias>]

  • -
  • label_feature – feature name to be used as label data

  • -
  • description – text description of the vector

  • -
  • entity_rows – URI of the data entity rows to join with

  • -
  • target – where to write the results to

  • -
  • drop_columns – list of columns to drop from the final result

  • -
  • entity_timestamp_column – timestamp column name in the entity rows dataframe

  • -
  • run_config – function and/or run configuration -see RunConfig

  • -
  • start_time – datetime, low limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • end_time – datetime, high limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • with_indexes – return vector with index columns (default False)

  • -
  • update_stats – update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-

:returns feature_vector input

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.1/static/example.html b/functions/development/get_offline_features/0.9.1/static/example.html deleted file mode 100644 index 4e837c4f..00000000 --- a/functions/development/get_offline_features/0.9.1/static/example.html +++ /dev/null @@ -1,1259 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

get_offline_features() from MLRun FeatureStore

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example

-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/01e548158b906df8cc3f4f282097c5f50e603245dd43f2314bc9ff5ef6aa1447.svg
-
-
-
-

Ingest Data Into Offline And Online Stores

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.1/static/function.html b/functions/development/get_offline_features/0.9.1/static/function.html deleted file mode 100644 index 00a8866c..00000000 --- a/functions/development/get_offline_features/0.9.1/static/function.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: e7bd4fc1f7377374a910d4aa21e31dbba31f59a3
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBMaXN0W3N0cl0gPSBOb25lLAogICAgbGFiZWxfZmVhdHVyZTogc3RyID0gTm9uZSwKICAgIGRlc2NyaXB0aW9uOiBzdHIgPSBOb25lLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQuCiAgICBJZiBmZWF0dXJlIHZlY3RvciBkb2VzIG5vdCBleGlzdCwgYSBuZXcgb25lIHdpbGwgYmUgY3JlYXRlZCBhbmQgc2F2ZWQgd2l0aCB0aGUgZ2l2ZW4gZmVhdHVyZXMuCgogICAgVGhlIHN0YXJ0X3RpbWUgYW5kIGVuZF90aW1lIGF0dHJpYnV0ZXMgYWxsb3cgZmlsdGVyaW5nIHRoZSBkYXRhIHRvIGEgZ2l2ZW4gdGltZSByYW5nZSwgdGhleSBhY2NlcHQKICAgIHN0cmluZyB2YWx1ZXMgb3IgcGFuZGFzIGBUaW1lc3RhbXBgIG9iamVjdHMsIHN0cmluZyB2YWx1ZXMgY2FuIGFsc28gYmUgcmVsYXRpdmU
sIGZvciBleGFtcGxlOgogICAgIm5vdyIsICJub3cgLSAxZDJoIiwgIm5vdys1bSIsIHdoZXJlIGEgdmFsaWQgcGFuZGFzIFRpbWVkZWx0YSBzdHJpbmcgZm9sbG93cyB0aGUgdmVyYiAibm93IiwKICAgIGZvciB0aW1lIGFsaWdubWVudCB5b3UgY2FuIHVzZSB0aGUgdmVyYiAiZmxvb3IiIGUuZy4gIm5vdyAtMWQgZmxvb3IgMUgiIHdpbGwgYWxpZ24gdGhlIHRpbWUgdG8gdGhlIGxhc3QgaG91cgogICAgKHRoZSBmbG9vciBzdHJpbmcgaXMgcGFzc2VkIHRvIHBhbmRhcy5UaW1lc3RhbXAuZmxvb3IoKSwgY2FuIHVzZSBELCBILCBULCBTIGZvciBkYXksIGhvdXIsIG1pbiwgc2VjIGFsaWdubWVudCkKCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZV92ZWN0b3I6IGZlYXR1cmUgdmVjdG9yIHVyaQogICAgOnBhcmFtIGZlYXR1cmVzOiAgICAgICBSZWxldmFudCBvbmx5IGlmIGZlYXR1cmVfdmVjdG9yIG5vdCBleGlzdC4gbGlzdCBvZiBmZWF0dXJlIHRvIGNvbGxlY3QgdG8gdGhpcyB2ZWN0b3IKICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9ybWF0IFs8cHJvamVjdD4vXTxmZWF0dXJlX3NldD4uPGZlYXR1cmVfbmFtZSBvciAqPiBbYXMgPGFsaWFzPl0KICAgIDpwYXJhbSBsYWJlbF9mZWF0dXJlOiAgZmVhdHVyZSBuYW1lIHRvIGJlIHVzZWQgYXMgbGFiZWwgZGF0YQogICAgOnBhcmFtIGRlc2NyaXB0aW9uOiAgICB0ZXh0IGRlc2NyaXB0aW9uIG9mIHRoZSB2ZWN0b3IKICAgIDpwYXJhbSBlbnRpdHlfcm93czogICAgVVJJIG9mIHRoZSBkYXRhIGVudGl0eSByb3dzIHRvIGpvaW4gd2l0aAogICAgOnBhcmFtIHRhcmdldDogICAgICAgICB3aGVyZSB0byB3cml0ZSB0aGUgcmVzdWx0cyB0bwogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIGVuZF90aW1lOiAgICAgICAgZGF0ZXRpbWUsIGhpZ2ggbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwo
gICAgOnBhcmFtIHdpdGhfaW5kZXhlczogICAgcmV0dXJuIHZlY3RvciB3aXRoIGluZGV4IGNvbHVtbnMgKGRlZmF1bHQgRmFsc2UpCiAgICA6cGFyYW0gdXBkYXRlX3N0YXRzOiAgICB1cGRhdGUgZmVhdHVyZXMgc3RhdGlzdGljcyBmcm9tIHRoZSByZXF1ZXN0ZWQgZmVhdHVyZSBzZXRzIG9uIHRoZSB2ZWN0b3IuIERlZmF1bHQgaXMgRmFsc2UuCgogICAgOnJldHVybnMgZmVhdHVyZV92ZWN0b3IgaW5wdXQKICAgICIiIgoKICAgIGlmIGZlYXR1cmVzIGlzIG5vdCBOb25lOgogICAgICAgICMgQ3JlYXRpbmcgYSBuZXcgRmVhdHVyZVZlY3RvciBhbmQgc2F2aW5nOgogICAgICAgIGlmIGlzX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3Rvcik6CiAgICAgICAgICAgIHByZWZpeCwgbmV3X3VyaSA9IHBhcnNlX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3RvcikKICAgICAgICAgICAgaWYgcHJlZml4ICE9IFN0b3JlUHJlZml4LkZlYXR1cmVWZWN0b3I6CiAgICAgICAgICAgICAgICByYWlzZSBNTFJ1bkludmFsaWRBcmd1bWVudEVycm9yKAogICAgICAgICAgICAgICAgICAgIGYicHJvdmlkZWQgc3RvcmUgdXJpICh7ZmVhdHVyZV92ZWN0b3J9KSBkb2VzIG5vdCByZXByZXNlbnQgYSBmZWF0dXJlIHZlY3RvciAocHJlZml4PXtwcmVmaXh9KSIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSBuZXdfdXJpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iKQogICAgICAgIHByb2plY3QsIG5hbWUsIHRhZywgXyA9IHBhcnNlX3ZlcnNpb25lZF9vYmplY3RfdXJpKGZlYXR1cmVfdmVjdG9yLCBtbHJ1bi5tbGNvbmYuZGVmYXVsdF9wcm9qZWN0KQogICAgICAgIHZlY3RvciA9IGZzLkZlYXR1cmVWZWN0b3IobmFtZSwgZmVhdHVyZXMsIGxhYmVsX2ZlYXR1cmU9bGFiZWxfZmVhdHVyZSwgZGVzY3JpcHRpb249ZGVzY3JpcHRpb24pCiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnByb2plY3QgPSBwcm9qZWN0CiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnRhZyA9IHRhZwogICAgICAgIHZlY3Rvci5zYXZlKCkKICAgICAgICBmZWF0dXJlX3ZlY3RvciA9IHZlY3Rvci51cmkKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICA
gICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg==
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#552572c0a503a86a12830c7ab8eb515b2f1526fa:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-    origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target.
-
-        If feature vector does not exist, a new one will be created and saved with
-        the given features.
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment)'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri
-        default: ''
-      - name: features
-        type: List[str]
-        doc: Relevant only if feature_vector not exist. list of feature to collect
-          to this vector format [/]. [as
-          ]
-        default: null
-      - name: label_feature
-        type: str
-        doc: feature name to be used as label data
-        default: null
-      - name: description
-        type: str
-        doc: text description of the vector
-        default: null
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 13
-  description: retrieve offline feature vector results
-  default_handler: get_offline_features
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.1/static/item.html b/functions/development/get_offline_features/0.9.1/static/item.html deleted file mode 100644 index 05d3cf9a..00000000 --- a/functions/development/get_offline_features/0.9.1/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-05-25:10-58
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.0.1
-name: get_offline_features
-platformVersion: ''
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/0.9.1/static/source.html b/functions/development/get_offline_features/0.9.1/static/source.html deleted file mode 100644 index a06d4cdf..00000000 --- a/functions/development/get_offline_features/0.9.1/static/source.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: List[str] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features is not None:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        context.logger.info(f"Creating FeatureVector {feature_vector}")
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.0/src/function.yaml b/functions/development/get_offline_features/1.0.0/src/function.yaml deleted file mode 100644 index c3c929ad..00000000 --- a/functions/development/get_offline_features/1.0.0/src/function.yaml +++ /dev/null @@ -1,126 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: e7bd4fc1f7377374a910d4aa21e31dbba31f59a3 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBMaXN0W3N0cl0gPSBOb25lLAogICAgbGFiZWxfZmVhdHVyZTogc3RyID0gTm9uZSwKICAgIGRlc2NyaXB0aW9uOiBzdHIgPSBOb25lLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZC
BmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQuCiAgICBJZiBmZWF0dXJlIHZlY3RvciBkb2VzIG5vdCBleGlzdCwgYSBuZXcgb25lIHdpbGwgYmUgY3JlYXRlZCBhbmQgc2F2ZWQgd2l0aCB0aGUgZ2l2ZW4gZmVhdHVyZXMuCgogICAgVGhlIHN0YXJ0X3RpbWUgYW5kIGVuZF90aW1lIGF0dHJpYnV0ZXMgYWxsb3cgZmlsdGVyaW5nIHRoZSBkYXRhIHRvIGEgZ2l2ZW4gdGltZSByYW5nZSwgdGhleSBhY2NlcHQKICAgIHN0cmluZyB2YWx1ZXMgb3IgcGFuZGFzIGBUaW1lc3RhbXBgIG9iamVjdHMsIHN0cmluZyB2YWx1ZXMgY2FuIGFsc28gYmUgcmVsYXRpdmUsIGZvciBleGFtcGxlOgogICAgIm5vdyIsICJub3cgLSAxZDJoIiwgIm5vdys1bSIsIHdoZXJlIGEgdmFsaWQgcGFuZGFzIFRpbWVkZWx0YSBzdHJpbmcgZm9sbG93cyB0aGUgdmVyYiAibm93IiwKICAgIGZvciB0aW1lIGFsaWdubWVudCB5b3UgY2FuIHVzZSB0aGUgdmVyYiAiZmxvb3IiIGUuZy4gIm5vdyAtMWQgZmxvb3IgMUgiIHdpbGwgYWxpZ24gdGhlIHRpbWUgdG8gdGhlIGxhc3QgaG91cgogICAgKHRoZSBmbG9vciBzdHJpbmcgaXMgcGFzc2VkIHRvIHBhbmRhcy5UaW1lc3RhbXAuZmxvb3IoKSwgY2FuIHVzZSBELCBILCBULCBTIGZvciBkYXksIGhvdXIsIG1pbiwgc2VjIGFsaWdubWVudCkKCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZV92ZWN0b3I6IGZlYXR1cmUgdmVjdG9yIHVyaQogICAgOnBhcmFtIGZlYXR1cmVzOiAgICAgICBSZWxldmFudCBvbmx5IGlmIGZlYXR1cmVfdmVjdG9yIG5vdCBleGlzdC4gbGlzdCBvZiBmZWF0dXJlIHRvIGNvbGxlY3QgdG8gdGhpcyB2ZWN0b3IKICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9ybWF0IFs8cHJvamVjdD4vXTxmZWF0dXJlX3NldD4uPGZlYXR1cmVfbmFtZSBvciAqPiBbYXMgPGFsaWFzPl0KICAgIDpwYXJhbSBsYWJlbF9mZWF0dXJlOiAgZmVhdHVyZSBuYW1lIHRvIGJlIHVzZWQgYXMgbGFiZWwgZGF0YQogICAgOnBhcmFtIGRlc2NyaXB0aW9uOiAgICB0ZXh0IGRlc2NyaXB0aW9uIG9mIHRoZSB2ZWN0b3IKICAgIDpwYXJhbSBlbnRpdHlfcm93czogICAgVVJJIG9mIHRoZSBkYXRhIGVudGl0eSByb3dzIHRvIGpvaW4gd2l0aAogICAgOnBhcmFtIHRhcmdldDogICAgICAgICB3aGVyZSB0byB3cml0ZSB0aGUgcmVzdWx0cyB0bwogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZn
JhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIGVuZF90aW1lOiAgICAgICAgZGF0ZXRpbWUsIGhpZ2ggbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIHdpdGhfaW5kZXhlczogICAgcmV0dXJuIHZlY3RvciB3aXRoIGluZGV4IGNvbHVtbnMgKGRlZmF1bHQgRmFsc2UpCiAgICA6cGFyYW0gdXBkYXRlX3N0YXRzOiAgICB1cGRhdGUgZmVhdHVyZXMgc3RhdGlzdGljcyBmcm9tIHRoZSByZXF1ZXN0ZWQgZmVhdHVyZSBzZXRzIG9uIHRoZSB2ZWN0b3IuIERlZmF1bHQgaXMgRmFsc2UuCgogICAgOnJldHVybnMgZmVhdHVyZV92ZWN0b3IgaW5wdXQKICAgICIiIgoKICAgIGlmIGZlYXR1cmVzIGlzIG5vdCBOb25lOgogICAgICAgICMgQ3JlYXRpbmcgYSBuZXcgRmVhdHVyZVZlY3RvciBhbmQgc2F2aW5nOgogICAgICAgIGlmIGlzX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3Rvcik6CiAgICAgICAgICAgIHByZWZpeCwgbmV3X3VyaSA9IHBhcnNlX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3RvcikKICAgICAgICAgICAgaWYgcHJlZml4ICE9IFN0b3JlUHJlZml4LkZlYXR1cmVWZWN0b3I6CiAgICAgICAgICAgICAgICByYWlzZSBNTFJ1bkludmFsaWRBcmd1bWVudEVycm9yKAogICAgICAgICAgICAgICAgICAgIGYicHJvdmlkZWQgc3RvcmUgdXJpICh7ZmVhdHVyZV92ZWN0b3J9KSBkb2VzIG5vdCByZXByZXNlbnQgYSBmZWF0dXJlIHZlY3RvciAocHJlZml4PXtwcmVmaXh9KSIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSBuZXdfdXJpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iKQogICAgICAgIHByb2plY3QsIG5hbWUsIHRhZywgXyA9IHBhcnNlX3ZlcnNpb25lZF9vYmplY3RfdXJpKGZlYXR1cmVfdmVjdG9yLCBtbHJ1bi5tbGNvbmYuZGVmYXVsdF9wcm9qZWN0KQogICAgICAgIHZlY3RvciA9IGZzLkZlYXR1cmVWZWN0b3IobmFtZSwgZmVhdHVyZXMsIGxhYmVsX2ZlYXR1cmU9bGFiZWxfZmVhdHVyZSwgZGVzY3JpcHRpb249ZGVzY3JpcHRpb24pCiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnByb2plY3QgPSBwcm9qZWN0CiAgICAgICAgdmVjdG9yLm1ldG
FkYXRhLnRhZyA9IHRhZwogICAgICAgIHZlY3Rvci5zYXZlKCkKICAgICAgICBmZWF0dXJlX3ZlY3RvciA9IHZlY3Rvci51cmkKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg== - commands: [] - code_origin: 
https://github.com/mlrun/functions.git#552572c0a503a86a12830c7ab8eb515b2f1526fa:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target. - - If feature vector does not exist, a new one will be created and saved with - the given features. - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment)' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: feature_vector - type: str - doc: feature vector uri - default: '' - - name: features - type: List[str] - doc: Relevant only if feature_vector not exist. list of feature to collect - to this vector format [/]. 
[as - ] - default: null - - name: label_feature - type: str - doc: feature name to be used as label data - default: null - - name: description - type: str - doc: text description of the vector - default: null - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with - default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 13 - description: retrieve offline feature vector results - default_handler: get_offline_features - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null -verbose: false diff --git a/functions/development/get_offline_features/1.0.0/src/get_offline_features.ipynb b/functions/development/get_offline_features/1.0.0/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/1.0.0/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " 
\"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/1.0.0/src/get_offline_features.py b/functions/development/get_offline_features/1.0.0/src/get_offline_features.py deleted file mode 100644 index 59cd114d..00000000 --- a/functions/development/get_offline_features/1.0.0/src/get_offline_features.py +++ /dev/null @@ -1,121 +0,0 @@ -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: List[str] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features is not None: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) diff --git 
a/functions/development/get_offline_features/1.0.0/src/item.yaml b/functions/development/get_offline_features/1.0.0/src/item.yaml deleted file mode 100644 index cfd44487..00000000 --- a/functions/development/get_offline_features/1.0.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-05-25:10-58 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.0.1 -name: get_offline_features -platformVersion: '' -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.0.0 diff --git a/functions/development/get_offline_features/1.0.0/src/test_get_offline_features.py b/functions/development/get_offline_features/1.0.0/src/test_get_offline_features.py deleted file mode 100644 index 1aa92f0b..00000000 --- a/functions/development/get_offline_features/1.0.0/src/test_get_offline_features.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. 
- """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. - """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": 
[ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. - """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the 
dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/1.0.0/static/documentation.html b/functions/development/get_offline_features/1.0.0/static/documentation.html deleted file mode 100644 index 7e82b21a..00000000 --- a/functions/development/get_offline_features/1.0.0/static/documentation.html +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

get_offline_features package

-
-

Submodules

-
-
-

get_offline_features.get_offline_features module

-
-
-get_offline_features.get_offline_features.get_offline_features(context: mlrun.execution.MLClientCtx, feature_vector: str, features: Optional[List[str]] = None, label_feature: Optional[str] = None, description: Optional[str] = None, entity_rows: Optional[mlrun.datastore.base.DataItem] = None, entity_timestamp_column: Optional[str] = None, target: Optional[Union[str, Dict]] = None, run_config: Optional[Union[str, Dict]] = None, drop_columns: Optional[List[str]] = None, start_time: Optional[str] = None, end_time: Optional[str] = None, with_indexes: bool = False, update_stats: bool = False)[source]
-

retrieve offline feature vector results

-

specify a feature vector object/uri and retrieve the desired features, their metadata -and statistics. returns OfflineVectorResponse, -results can be returned as a dataframe or written to a target. -If feature vector does not exist, a new one will be created and saved with the given features.

-

The start_time and end_time attributes allow filtering the data to a given time range, they accept -string values or pandas Timestamp objects, string values can also be relative, for example: -“now”, “now - 1d2h”, “now+5m”, where a valid pandas Timedelta string follows the verb “now”, -for time alignment you can use the verb “floor” e.g. “now -1d floor 1H” will align the time to the last hour -(the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • feature_vector – feature vector uri

  • -
  • features – Relevant only if feature_vector not exist. list of feature to collect to this vector -format [<project>/]<feature_set>.<feature_name or *> [as <alias>]

  • -
  • label_feature – feature name to be used as label data

  • -
  • description – text description of the vector

  • -
  • entity_rows – URI of the data entity rows to join with

  • -
  • target – where to write the results to

  • -
  • drop_columns – list of columns to drop from the final result

  • -
  • entity_timestamp_column – timestamp column name in the entity rows dataframe

  • -
  • run_config – function and/or run configuration -see RunConfig

  • -
  • start_time – datetime, low limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • end_time – datetime, high limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • with_indexes – return vector with index columns (default False)

  • -
  • update_stats – update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-

:returns feature_vector input

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.0/static/example.html b/functions/development/get_offline_features/1.0.0/static/example.html deleted file mode 100644 index 4e837c4f..00000000 --- a/functions/development/get_offline_features/1.0.0/static/example.html +++ /dev/null @@ -1,1259 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

get_offline_features() from MLRun FeatureStore

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example

-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/01e548158b906df8cc3f4f282097c5f50e603245dd43f2314bc9ff5ef6aa1447.svg
-
-
-
-

Ingest Data Into Offline And Online Stores

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.0/static/function.html b/functions/development/get_offline_features/1.0.0/static/function.html deleted file mode 100644 index 00a8866c..00000000 --- a/functions/development/get_offline_features/1.0.0/static/function.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: e7bd4fc1f7377374a910d4aa21e31dbba31f59a3
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBMaXN0W3N0cl0gPSBOb25lLAogICAgbGFiZWxfZmVhdHVyZTogc3RyID0gTm9uZSwKICAgIGRlc2NyaXB0aW9uOiBzdHIgPSBOb25lLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQuCiAgICBJZiBmZWF0dXJlIHZlY3RvciBkb2VzIG5vdCBleGlzdCwgYSBuZXcgb25lIHdpbGwgYmUgY3JlYXRlZCBhbmQgc2F2ZWQgd2l0aCB0aGUgZ2l2ZW4gZmVhdHVyZXMuCgogICAgVGhlIHN0YXJ0X3RpbWUgYW5kIGVuZF90aW1lIGF0dHJpYnV0ZXMgYWxsb3cgZmlsdGVyaW5nIHRoZSBkYXRhIHRvIGEgZ2l2ZW4gdGltZSByYW5nZSwgdGhleSBhY2NlcHQKICAgIHN0cmluZyB2YWx1ZXMgb3IgcGFuZGFzIGBUaW1lc3RhbXBgIG9iamVjdHMsIHN0cmluZyB2YWx1ZXMgY2FuIGFsc28gYmUgcmVsYXRpdmU
sIGZvciBleGFtcGxlOgogICAgIm5vdyIsICJub3cgLSAxZDJoIiwgIm5vdys1bSIsIHdoZXJlIGEgdmFsaWQgcGFuZGFzIFRpbWVkZWx0YSBzdHJpbmcgZm9sbG93cyB0aGUgdmVyYiAibm93IiwKICAgIGZvciB0aW1lIGFsaWdubWVudCB5b3UgY2FuIHVzZSB0aGUgdmVyYiAiZmxvb3IiIGUuZy4gIm5vdyAtMWQgZmxvb3IgMUgiIHdpbGwgYWxpZ24gdGhlIHRpbWUgdG8gdGhlIGxhc3QgaG91cgogICAgKHRoZSBmbG9vciBzdHJpbmcgaXMgcGFzc2VkIHRvIHBhbmRhcy5UaW1lc3RhbXAuZmxvb3IoKSwgY2FuIHVzZSBELCBILCBULCBTIGZvciBkYXksIGhvdXIsIG1pbiwgc2VjIGFsaWdubWVudCkKCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZV92ZWN0b3I6IGZlYXR1cmUgdmVjdG9yIHVyaQogICAgOnBhcmFtIGZlYXR1cmVzOiAgICAgICBSZWxldmFudCBvbmx5IGlmIGZlYXR1cmVfdmVjdG9yIG5vdCBleGlzdC4gbGlzdCBvZiBmZWF0dXJlIHRvIGNvbGxlY3QgdG8gdGhpcyB2ZWN0b3IKICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9ybWF0IFs8cHJvamVjdD4vXTxmZWF0dXJlX3NldD4uPGZlYXR1cmVfbmFtZSBvciAqPiBbYXMgPGFsaWFzPl0KICAgIDpwYXJhbSBsYWJlbF9mZWF0dXJlOiAgZmVhdHVyZSBuYW1lIHRvIGJlIHVzZWQgYXMgbGFiZWwgZGF0YQogICAgOnBhcmFtIGRlc2NyaXB0aW9uOiAgICB0ZXh0IGRlc2NyaXB0aW9uIG9mIHRoZSB2ZWN0b3IKICAgIDpwYXJhbSBlbnRpdHlfcm93czogICAgVVJJIG9mIHRoZSBkYXRhIGVudGl0eSByb3dzIHRvIGpvaW4gd2l0aAogICAgOnBhcmFtIHRhcmdldDogICAgICAgICB3aGVyZSB0byB3cml0ZSB0aGUgcmVzdWx0cyB0bwogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIGVuZF90aW1lOiAgICAgICAgZGF0ZXRpbWUsIGhpZ2ggbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwo
gICAgOnBhcmFtIHdpdGhfaW5kZXhlczogICAgcmV0dXJuIHZlY3RvciB3aXRoIGluZGV4IGNvbHVtbnMgKGRlZmF1bHQgRmFsc2UpCiAgICA6cGFyYW0gdXBkYXRlX3N0YXRzOiAgICB1cGRhdGUgZmVhdHVyZXMgc3RhdGlzdGljcyBmcm9tIHRoZSByZXF1ZXN0ZWQgZmVhdHVyZSBzZXRzIG9uIHRoZSB2ZWN0b3IuIERlZmF1bHQgaXMgRmFsc2UuCgogICAgOnJldHVybnMgZmVhdHVyZV92ZWN0b3IgaW5wdXQKICAgICIiIgoKICAgIGlmIGZlYXR1cmVzIGlzIG5vdCBOb25lOgogICAgICAgICMgQ3JlYXRpbmcgYSBuZXcgRmVhdHVyZVZlY3RvciBhbmQgc2F2aW5nOgogICAgICAgIGlmIGlzX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3Rvcik6CiAgICAgICAgICAgIHByZWZpeCwgbmV3X3VyaSA9IHBhcnNlX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3RvcikKICAgICAgICAgICAgaWYgcHJlZml4ICE9IFN0b3JlUHJlZml4LkZlYXR1cmVWZWN0b3I6CiAgICAgICAgICAgICAgICByYWlzZSBNTFJ1bkludmFsaWRBcmd1bWVudEVycm9yKAogICAgICAgICAgICAgICAgICAgIGYicHJvdmlkZWQgc3RvcmUgdXJpICh7ZmVhdHVyZV92ZWN0b3J9KSBkb2VzIG5vdCByZXByZXNlbnQgYSBmZWF0dXJlIHZlY3RvciAocHJlZml4PXtwcmVmaXh9KSIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSBuZXdfdXJpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iKQogICAgICAgIHByb2plY3QsIG5hbWUsIHRhZywgXyA9IHBhcnNlX3ZlcnNpb25lZF9vYmplY3RfdXJpKGZlYXR1cmVfdmVjdG9yLCBtbHJ1bi5tbGNvbmYuZGVmYXVsdF9wcm9qZWN0KQogICAgICAgIHZlY3RvciA9IGZzLkZlYXR1cmVWZWN0b3IobmFtZSwgZmVhdHVyZXMsIGxhYmVsX2ZlYXR1cmU9bGFiZWxfZmVhdHVyZSwgZGVzY3JpcHRpb249ZGVzY3JpcHRpb24pCiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnByb2plY3QgPSBwcm9qZWN0CiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnRhZyA9IHRhZwogICAgICAgIHZlY3Rvci5zYXZlKCkKICAgICAgICBmZWF0dXJlX3ZlY3RvciA9IHZlY3Rvci51cmkKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICA
gICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIHRhcmdldC5wYXRoOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgidGFyZ2V0IiwgdGFyZ2V0LnBhdGgpCgogICAgIyBQcmVwYXJpbmcgcnVuX2NvbmZpZzoKICAgIGlmIHJ1bl9jb25maWcgYW5kIGlzaW5zdGFuY2UocnVuX2NvbmZpZywgZGljdCk6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiUHJlcGFyaW5nIHJ1biBjb25maWd1cmF0aW9uIikKICAgICAgICBydW5fY29uZmlnID0gZnMuUnVuQ29uZmlnKCoqcnVuX2NvbmZpZykKCiAgICAjIENhbGxpbmcgZ2V0X29mZmxpbmVfZmVhdHVyZXM6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgIGYiZ2V0dGluZyBvZmZsaW5lIGZlYXR1cmVzIGZyb20gdGhlIEZlYXR1cmVWZWN0b3Ige2ZlYXR1cmVfdmVjdG9yfSIKICAgICkKICAgIGZzLmdldF9vZmZsaW5lX2ZlYXR1cmVzKAogICAgICAgIGZlYXR1cmVfdmVjdG9yPWZlYXR1cmVfdmVjdG9yLAogICAgICAgIGVudGl0eV9yb3dzPWVudGl0eV9yb3dzLAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uPWVudGl0eV90aW1lc3RhbXBfY29sdW1uLAogICAgICAgIHRhcmdldD10YXJnZXQsCiAgICAgICAgcnVuX2NvbmZpZz1ydW5fY29uZmlnLAogICAgICAgIGRyb3BfY29sdW1ucz1kcm9wX2NvbHVtbnMsCiAgICAgICAgc3RhcnRfdGltZT1zdGFydF90aW1lLAogICAgICAgIGVuZF90aW1lPWVuZF90aW1lLAogICAgICAgIHdpdGhfaW5kZXhlcz13aXRoX2luZGV4ZXMsCiAgICAgICAgdXBkYXRlX3N0YXRzPXVwZGF0ZV9zdGF0cywKICAgICkKCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVfdmVjdG9yIiwgZmVhdHVyZV92ZWN0b3IpCg==
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#552572c0a503a86a12830c7ab8eb515b2f1526fa:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-    origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target.
-
-        If feature vector does not exist, a new one will be created and saved with
-        the given features.
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment)'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri
-        default: ''
-      - name: features
-        type: List[str]
-        doc: Relevant only if feature_vector not exist. list of feature to collect
-          to this vector format [/]. [as
-          ]
-        default: null
-      - name: label_feature
-        type: str
-        doc: feature name to be used as label data
-        default: null
-      - name: description
-        type: str
-        doc: text description of the vector
-        default: null
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 13
-  description: retrieve offline feature vector results
-  default_handler: get_offline_features
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.0/static/item.html b/functions/development/get_offline_features/1.0.0/static/item.html deleted file mode 100644 index 1ccce858..00000000 --- a/functions/development/get_offline_features/1.0.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-05-25:10-58
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.0.1
-name: get_offline_features
-platformVersion: ''
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.0.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.0/static/source.html b/functions/development/get_offline_features/1.0.0/static/source.html deleted file mode 100644 index a06d4cdf..00000000 --- a/functions/development/get_offline_features/1.0.0/static/source.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: List[str] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features is not None:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        context.logger.info(f"Creating FeatureVector {feature_vector}")
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.1/src/function.yaml b/functions/development/get_offline_features/1.0.1/src/function.yaml deleted file mode 100644 index f48004c9..00000000 --- a/functions/development/get_offline_features/1.0.1/src/function.yaml +++ /dev/null @@ -1,126 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: c27193bde78516cff1f9f25c397c0aced413df40 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBMaXN0W3N0cl0gPSBOb25lLAogICAgbGFiZWxfZmVhdHVyZTogc3RyID0gTm9uZSwKICAgIGRlc2NyaXB0aW9uOiBzdHIgPSBOb25lLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZC
BmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQuCiAgICBJZiBmZWF0dXJlIHZlY3RvciBkb2VzIG5vdCBleGlzdCwgYSBuZXcgb25lIHdpbGwgYmUgY3JlYXRlZCBhbmQgc2F2ZWQgd2l0aCB0aGUgZ2l2ZW4gZmVhdHVyZXMuCgogICAgVGhlIHN0YXJ0X3RpbWUgYW5kIGVuZF90aW1lIGF0dHJpYnV0ZXMgYWxsb3cgZmlsdGVyaW5nIHRoZSBkYXRhIHRvIGEgZ2l2ZW4gdGltZSByYW5nZSwgdGhleSBhY2NlcHQKICAgIHN0cmluZyB2YWx1ZXMgb3IgcGFuZGFzIGBUaW1lc3RhbXBgIG9iamVjdHMsIHN0cmluZyB2YWx1ZXMgY2FuIGFsc28gYmUgcmVsYXRpdmUsIGZvciBleGFtcGxlOgogICAgIm5vdyIsICJub3cgLSAxZDJoIiwgIm5vdys1bSIsIHdoZXJlIGEgdmFsaWQgcGFuZGFzIFRpbWVkZWx0YSBzdHJpbmcgZm9sbG93cyB0aGUgdmVyYiAibm93IiwKICAgIGZvciB0aW1lIGFsaWdubWVudCB5b3UgY2FuIHVzZSB0aGUgdmVyYiAiZmxvb3IiIGUuZy4gIm5vdyAtMWQgZmxvb3IgMUgiIHdpbGwgYWxpZ24gdGhlIHRpbWUgdG8gdGhlIGxhc3QgaG91cgogICAgKHRoZSBmbG9vciBzdHJpbmcgaXMgcGFzc2VkIHRvIHBhbmRhcy5UaW1lc3RhbXAuZmxvb3IoKSwgY2FuIHVzZSBELCBILCBULCBTIGZvciBkYXksIGhvdXIsIG1pbiwgc2VjIGFsaWdubWVudCkKCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZV92ZWN0b3I6IGZlYXR1cmUgdmVjdG9yIHVyaQogICAgOnBhcmFtIGZlYXR1cmVzOiAgICAgICBSZWxldmFudCBvbmx5IGlmIGZlYXR1cmVfdmVjdG9yIG5vdCBleGlzdC4gbGlzdCBvZiBmZWF0dXJlIHRvIGNvbGxlY3QgdG8gdGhpcyB2ZWN0b3IKICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9ybWF0IFs8cHJvamVjdD4vXTxmZWF0dXJlX3NldD4uPGZlYXR1cmVfbmFtZSBvciAqPiBbYXMgPGFsaWFzPl0KICAgIDpwYXJhbSBsYWJlbF9mZWF0dXJlOiAgZmVhdHVyZSBuYW1lIHRvIGJlIHVzZWQgYXMgbGFiZWwgZGF0YQogICAgOnBhcmFtIGRlc2NyaXB0aW9uOiAgICB0ZXh0IGRlc2NyaXB0aW9uIG9mIHRoZSB2ZWN0b3IKICAgIDpwYXJhbSBlbnRpdHlfcm93czogICAgVVJJIG9mIHRoZSBkYXRhIGVudGl0eSByb3dzIHRvIGpvaW4gd2l0aAogICAgOnBhcmFtIHRhcmdldDogICAgICAgICB3aGVyZSB0byB3cml0ZSB0aGUgcmVzdWx0cyB0bwogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZn
JhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIGVuZF90aW1lOiAgICAgICAgZGF0ZXRpbWUsIGhpZ2ggbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIHdpdGhfaW5kZXhlczogICAgcmV0dXJuIHZlY3RvciB3aXRoIGluZGV4IGNvbHVtbnMgKGRlZmF1bHQgRmFsc2UpCiAgICA6cGFyYW0gdXBkYXRlX3N0YXRzOiAgICB1cGRhdGUgZmVhdHVyZXMgc3RhdGlzdGljcyBmcm9tIHRoZSByZXF1ZXN0ZWQgZmVhdHVyZSBzZXRzIG9uIHRoZSB2ZWN0b3IuIERlZmF1bHQgaXMgRmFsc2UuCgogICAgOnJldHVybnMgZmVhdHVyZV92ZWN0b3IgaW5wdXQKICAgICIiIgoKICAgIGlmIGZlYXR1cmVzIGlzIG5vdCBOb25lOgogICAgICAgICMgQ3JlYXRpbmcgYSBuZXcgRmVhdHVyZVZlY3RvciBhbmQgc2F2aW5nOgogICAgICAgIGlmIGlzX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3Rvcik6CiAgICAgICAgICAgIHByZWZpeCwgbmV3X3VyaSA9IHBhcnNlX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3RvcikKICAgICAgICAgICAgaWYgcHJlZml4ICE9IFN0b3JlUHJlZml4LkZlYXR1cmVWZWN0b3I6CiAgICAgICAgICAgICAgICByYWlzZSBNTFJ1bkludmFsaWRBcmd1bWVudEVycm9yKAogICAgICAgICAgICAgICAgICAgIGYicHJvdmlkZWQgc3RvcmUgdXJpICh7ZmVhdHVyZV92ZWN0b3J9KSBkb2VzIG5vdCByZXByZXNlbnQgYSBmZWF0dXJlIHZlY3RvciAocHJlZml4PXtwcmVmaXh9KSIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSBuZXdfdXJpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iKQogICAgICAgIHByb2plY3QsIG5hbWUsIHRhZywgXyA9IHBhcnNlX3ZlcnNpb25lZF9vYmplY3RfdXJpKGZlYXR1cmVfdmVjdG9yLCBtbHJ1bi5tbGNvbmYuZGVmYXVsdF9wcm9qZWN0KQogICAgICAgIHZlY3RvciA9IGZzLkZlYXR1cmVWZWN0b3IobmFtZSwgZmVhdHVyZXMsIGxhYmVsX2ZlYXR1cmU9bGFiZWxfZmVhdHVyZSwgZGVzY3JpcHRpb249ZGVzY3JpcHRpb24pCiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnByb2plY3QgPSBwcm9qZWN0CiAgICAgICAgdmVjdG9yLm1ldG
FkYXRhLnRhZyA9IHRhZwogICAgICAgIHZlY3Rvci5zYXZlKCkKICAgICAgICBmZWF0dXJlX3ZlY3RvciA9IHZlY3Rvci51cmkKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIGhhc2F0dHIodGFyZ2V0LCAncGF0aCcpIGFuZCB0YXJnZXQucGF0aDoKICAgICAgICBjb250ZXh0LmxvZ19yZXN1bHQoInRhcmdldCIsIHRhcmdldC5wYXRoKQoKICAgICMgUHJlcGFyaW5nIHJ1bl9jb25maWc6CiAgICBpZiBydW5fY29uZmlnIGFuZCBpc2luc3RhbmNlKHJ1bl9jb25maWcsIGRpY3QpOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIlByZXBhcmluZyBydW4gY29uZmlndXJhdGlvbiIpCiAgICAgICAgcnVuX2NvbmZpZyA9IGZzLlJ1bkNvbmZpZygqKnJ1bl9jb25maWcpCgogICAgIyBDYWxsaW5nIGdldF9vZmZsaW5lX2ZlYXR1cmVzOgogICAgY29udGV4dC5sb2dnZXIuaW5mbygKICAgICAgICBmImdldHRpbmcgb2ZmbGluZSBmZWF0dXJlcyBmcm9tIHRoZSBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iCiAgICApCiAgICBmcy5nZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgICAgICBmZWF0dXJlX3ZlY3Rvcj1mZWF0dXJlX3ZlY3RvciwKICAgICAgICBlbnRpdHlfcm93cz1lbnRpdHlfcm93cywKICAgICAgICBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbj1lbnRpdHlfdGltZXN0YW1wX2NvbHVtbiwKICAgICAgICB0YXJnZXQ9dGFyZ2V0LAogICAgICAgIHJ1bl9jb25maWc9cnVuX2NvbmZpZywKICAgICAgICBkcm9wX2NvbHVtbnM9ZHJvcF9jb2x1bW5zLAogICAgICAgIHN0YXJ0X3RpbWU9c3RhcnRfdGltZSwKICAgICAgICBlbmRfdGltZT1lbmRfdGltZSwKICAgICAgICB3aXRoX2luZGV4ZXM9d2l0aF9pbmRleGVzLAogICAgICAgIHVwZGF0ZV9zdGF0cz11cGRhdGVfc3RhdHMsCiAgICApCgogICAgY29udGV4dC5sb2dfcmVzdWx0KCJmZWF0dXJlX3ZlY3RvciIsIGZlYXR1cmVfdmVjdG9yKQo= - commands: [] - code_origin: 
https://github.com/mlrun/functions.git#9899e19f8c98568005dabe064fa29db6d9a47531:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target. - - If feature vector does not exist, a new one will be created and saved with - the given features. - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment)' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: feature_vector - type: str - doc: feature vector uri - default: '' - - name: features - type: List[str] - doc: Relevant only if feature_vector not exist. list of feature to collect - to this vector format [/]. 
[as - ] - default: null - - name: label_feature - type: str - doc: feature name to be used as label data - default: null - - name: description - type: str - doc: text description of the vector - default: null - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with - default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 13 - description: retrieve offline feature vector results - default_handler: get_offline_features - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null -verbose: false diff --git a/functions/development/get_offline_features/1.0.1/src/get_offline_features.ipynb b/functions/development/get_offline_features/1.0.1/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/1.0.1/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " 
\"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/1.0.1/src/get_offline_features.py b/functions/development/get_offline_features/1.0.1/src/get_offline_features.py deleted file mode 100644 index 9347637c..00000000 --- a/functions/development/get_offline_features/1.0.1/src/get_offline_features.py +++ /dev/null @@ -1,121 +0,0 @@ -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: List[str] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features is not None: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) diff --git 
a/functions/development/get_offline_features/1.0.1/src/item.yaml b/functions/development/get_offline_features/1.0.1/src/item.yaml deleted file mode 100644 index 3b453937..00000000 --- a/functions/development/get_offline_features/1.0.1/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-05-25:10-58 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.0.1 -name: get_offline_features -platformVersion: '' -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.0.1 diff --git a/functions/development/get_offline_features/1.0.1/src/test_get_offline_features.py b/functions/development/get_offline_features/1.0.1/src/test_get_offline_features.py deleted file mode 100644 index 1aa92f0b..00000000 --- a/functions/development/get_offline_features/1.0.1/src/test_get_offline_features.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. 
- """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. - """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": 
[ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. - """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the 
dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/1.0.1/static/documentation.html b/functions/development/get_offline_features/1.0.1/static/documentation.html deleted file mode 100644 index 7e82b21a..00000000 --- a/functions/development/get_offline_features/1.0.1/static/documentation.html +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

get_offline_features package

-
-

Submodules

-
-
-

get_offline_features.get_offline_features module

-
-
-get_offline_features.get_offline_features.get_offline_features(context: mlrun.execution.MLClientCtx, feature_vector: str, features: Optional[List[str]] = None, label_feature: Optional[str] = None, description: Optional[str] = None, entity_rows: Optional[mlrun.datastore.base.DataItem] = None, entity_timestamp_column: Optional[str] = None, target: Optional[Union[str, Dict]] = None, run_config: Optional[Union[str, Dict]] = None, drop_columns: Optional[List[str]] = None, start_time: Optional[str] = None, end_time: Optional[str] = None, with_indexes: bool = False, update_stats: bool = False)[source]
-

retrieve offline feature vector results

-

specify a feature vector object/uri and retrieve the desired features, their metadata -and statistics. returns OfflineVectorResponse, -results can be returned as a dataframe or written to a target. -If feature vector does not exist, a new one will be created and saved with the given features.

-

The start_time and end_time attributes allow filtering the data to a given time range, they accept -string values or pandas Timestamp objects, string values can also be relative, for example: -“now”, “now - 1d2h”, “now+5m”, where a valid pandas Timedelta string follows the verb “now”, -for time alignment you can use the verb “floor” e.g. “now -1d floor 1H” will align the time to the last hour -(the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • feature_vector – feature vector uri

  • -
  • features – Relevant only if feature_vector not exist. list of feature to collect to this vector -format [<project>/]<feature_set>.<feature_name or *> [as <alias>]

  • -
  • label_feature – feature name to be used as label data

  • -
  • description – text description of the vector

  • -
  • entity_rows – URI of the data entity rows to join with

  • -
  • target – where to write the results to

  • -
  • drop_columns – list of columns to drop from the final result

  • -
  • entity_timestamp_column – timestamp column name in the entity rows dataframe

  • -
  • run_config – function and/or run configuration -see RunConfig

  • -
  • start_time – datetime, low limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • end_time – datetime, high limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • with_indexes – return vector with index columns (default False)

  • -
  • update_stats – update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-

:returns feature_vector input

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.1/static/example.html b/functions/development/get_offline_features/1.0.1/static/example.html deleted file mode 100644 index 4e837c4f..00000000 --- a/functions/development/get_offline_features/1.0.1/static/example.html +++ /dev/null @@ -1,1259 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

get_offline_features() from MLRun FeatureStore

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example

-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/01e548158b906df8cc3f4f282097c5f50e603245dd43f2314bc9ff5ef6aa1447.svg
-
-
-
-

Ingest Data Into Offline And Online Stores

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.1/static/function.html b/functions/development/get_offline_features/1.0.1/static/function.html deleted file mode 100644 index 666a118c..00000000 --- a/functions/development/get_offline_features/1.0.1/static/function.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: c27193bde78516cff1f9f25c397c0aced413df40
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBMaXN0W3N0cl0gPSBOb25lLAogICAgbGFiZWxfZmVhdHVyZTogc3RyID0gTm9uZSwKICAgIGRlc2NyaXB0aW9uOiBzdHIgPSBOb25lLAogICAgZW50aXR5X3Jvd3M6IERhdGFJdGVtID0gTm9uZSwKICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0OiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHJ1bl9jb25maWc6IFVuaW9uW3N0ciwgRGljdF0gPSBOb25lLAogICAgZHJvcF9jb2x1bW5zOiBMaXN0W3N0cl0gPSBOb25lLAogICAgc3RhcnRfdGltZTogc3RyID0gTm9uZSwKICAgIGVuZF90aW1lOiBzdHIgPSBOb25lLAogICAgd2l0aF9pbmRleGVzOiBib29sID0gRmFsc2UsCiAgICB1cGRhdGVfc3RhdHM6IGJvb2wgPSBGYWxzZSwKKToKICAgICIiInJldHJpZXZlIG9mZmxpbmUgZmVhdHVyZSB2ZWN0b3IgcmVzdWx0cwoKICAgIHNwZWNpZnkgYSBmZWF0dXJlIHZlY3RvciBvYmplY3QvdXJpIGFuZCByZXRyaWV2ZSB0aGUgZGVzaXJlZCBmZWF0dXJlcywgdGhlaXIgbWV0YWRhdGEKICAgIGFuZCBzdGF0aXN0aWNzLiByZXR1cm5zIDpweTpjbGFzczpgfm1scnVuLmZlYXR1cmVfc3RvcmUuT2ZmbGluZVZlY3RvclJlc3BvbnNlYCwKICAgIHJlc3VsdHMgY2FuIGJlIHJldHVybmVkIGFzIGEgZGF0YWZyYW1lIG9yIHdyaXR0ZW4gdG8gYSB0YXJnZXQuCiAgICBJZiBmZWF0dXJlIHZlY3RvciBkb2VzIG5vdCBleGlzdCwgYSBuZXcgb25lIHdpbGwgYmUgY3JlYXRlZCBhbmQgc2F2ZWQgd2l0aCB0aGUgZ2l2ZW4gZmVhdHVyZXMuCgogICAgVGhlIHN0YXJ0X3RpbWUgYW5kIGVuZF90aW1lIGF0dHJpYnV0ZXMgYWxsb3cgZmlsdGVyaW5nIHRoZSBkYXRhIHRvIGEgZ2l2ZW4gdGltZSByYW5nZSwgdGhleSBhY2NlcHQKICAgIHN0cmluZyB2YWx1ZXMgb3IgcGFuZGFzIGBUaW1lc3RhbXBgIG9iamVjdHMsIHN0cmluZyB2YWx1ZXMgY2FuIGFsc28gYmUgcmVsYXRpdmU
sIGZvciBleGFtcGxlOgogICAgIm5vdyIsICJub3cgLSAxZDJoIiwgIm5vdys1bSIsIHdoZXJlIGEgdmFsaWQgcGFuZGFzIFRpbWVkZWx0YSBzdHJpbmcgZm9sbG93cyB0aGUgdmVyYiAibm93IiwKICAgIGZvciB0aW1lIGFsaWdubWVudCB5b3UgY2FuIHVzZSB0aGUgdmVyYiAiZmxvb3IiIGUuZy4gIm5vdyAtMWQgZmxvb3IgMUgiIHdpbGwgYWxpZ24gdGhlIHRpbWUgdG8gdGhlIGxhc3QgaG91cgogICAgKHRoZSBmbG9vciBzdHJpbmcgaXMgcGFzc2VkIHRvIHBhbmRhcy5UaW1lc3RhbXAuZmxvb3IoKSwgY2FuIHVzZSBELCBILCBULCBTIGZvciBkYXksIGhvdXIsIG1pbiwgc2VjIGFsaWdubWVudCkKCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZV92ZWN0b3I6IGZlYXR1cmUgdmVjdG9yIHVyaQogICAgOnBhcmFtIGZlYXR1cmVzOiAgICAgICBSZWxldmFudCBvbmx5IGlmIGZlYXR1cmVfdmVjdG9yIG5vdCBleGlzdC4gbGlzdCBvZiBmZWF0dXJlIHRvIGNvbGxlY3QgdG8gdGhpcyB2ZWN0b3IKICAgICAgICAgICAgICAgICAgICAgICAgICAgZm9ybWF0IFs8cHJvamVjdD4vXTxmZWF0dXJlX3NldD4uPGZlYXR1cmVfbmFtZSBvciAqPiBbYXMgPGFsaWFzPl0KICAgIDpwYXJhbSBsYWJlbF9mZWF0dXJlOiAgZmVhdHVyZSBuYW1lIHRvIGJlIHVzZWQgYXMgbGFiZWwgZGF0YQogICAgOnBhcmFtIGRlc2NyaXB0aW9uOiAgICB0ZXh0IGRlc2NyaXB0aW9uIG9mIHRoZSB2ZWN0b3IKICAgIDpwYXJhbSBlbnRpdHlfcm93czogICAgVVJJIG9mIHRoZSBkYXRhIGVudGl0eSByb3dzIHRvIGpvaW4gd2l0aAogICAgOnBhcmFtIHRhcmdldDogICAgICAgICB3aGVyZSB0byB3cml0ZSB0aGUgcmVzdWx0cyB0bwogICAgOnBhcmFtIGRyb3BfY29sdW1uczogICBsaXN0IG9mIGNvbHVtbnMgdG8gZHJvcCBmcm9tIHRoZSBmaW5hbCByZXN1bHQKICAgIDpwYXJhbSBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogdGltZXN0YW1wIGNvbHVtbiBuYW1lIGluIHRoZSBlbnRpdHkgcm93cyBkYXRhZnJhbWUKICAgIDpwYXJhbSBydW5fY29uZmlnOiAgICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlZSA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLlJ1bkNvbmZpZ2AKICAgIDpwYXJhbSBzdGFydF90aW1lOiAgICAgIGRhdGV0aW1lLCBsb3cgbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwogICAgOnBhcmFtIGVuZF90aW1lOiAgICAgICAgZGF0ZXRpbWUsIGhpZ2ggbGltaXQgb2YgdGltZSBuZWVkZWQgdG8gYmUgZmlsdGVyZWQuIE9wdGlvbmFsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4gbXVzdCBiZSBwYXNzZWQgd2hlbiB1c2luZyB0aW1lIGZpbHRlcmluZwo
gICAgOnBhcmFtIHdpdGhfaW5kZXhlczogICAgcmV0dXJuIHZlY3RvciB3aXRoIGluZGV4IGNvbHVtbnMgKGRlZmF1bHQgRmFsc2UpCiAgICA6cGFyYW0gdXBkYXRlX3N0YXRzOiAgICB1cGRhdGUgZmVhdHVyZXMgc3RhdGlzdGljcyBmcm9tIHRoZSByZXF1ZXN0ZWQgZmVhdHVyZSBzZXRzIG9uIHRoZSB2ZWN0b3IuIERlZmF1bHQgaXMgRmFsc2UuCgogICAgOnJldHVybnMgZmVhdHVyZV92ZWN0b3IgaW5wdXQKICAgICIiIgoKICAgIGlmIGZlYXR1cmVzIGlzIG5vdCBOb25lOgogICAgICAgICMgQ3JlYXRpbmcgYSBuZXcgRmVhdHVyZVZlY3RvciBhbmQgc2F2aW5nOgogICAgICAgIGlmIGlzX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3Rvcik6CiAgICAgICAgICAgIHByZWZpeCwgbmV3X3VyaSA9IHBhcnNlX3N0b3JlX3VyaShmZWF0dXJlX3ZlY3RvcikKICAgICAgICAgICAgaWYgcHJlZml4ICE9IFN0b3JlUHJlZml4LkZlYXR1cmVWZWN0b3I6CiAgICAgICAgICAgICAgICByYWlzZSBNTFJ1bkludmFsaWRBcmd1bWVudEVycm9yKAogICAgICAgICAgICAgICAgICAgIGYicHJvdmlkZWQgc3RvcmUgdXJpICh7ZmVhdHVyZV92ZWN0b3J9KSBkb2VzIG5vdCByZXByZXNlbnQgYSBmZWF0dXJlIHZlY3RvciAocHJlZml4PXtwcmVmaXh9KSIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSBuZXdfdXJpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iKQogICAgICAgIHByb2plY3QsIG5hbWUsIHRhZywgXyA9IHBhcnNlX3ZlcnNpb25lZF9vYmplY3RfdXJpKGZlYXR1cmVfdmVjdG9yLCBtbHJ1bi5tbGNvbmYuZGVmYXVsdF9wcm9qZWN0KQogICAgICAgIHZlY3RvciA9IGZzLkZlYXR1cmVWZWN0b3IobmFtZSwgZmVhdHVyZXMsIGxhYmVsX2ZlYXR1cmU9bGFiZWxfZmVhdHVyZSwgZGVzY3JpcHRpb249ZGVzY3JpcHRpb24pCiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnByb2plY3QgPSBwcm9qZWN0CiAgICAgICAgdmVjdG9yLm1ldGFkYXRhLnRhZyA9IHRhZwogICAgICAgIHZlY3Rvci5zYXZlKCkKICAgICAgICBmZWF0dXJlX3ZlY3RvciA9IHZlY3Rvci51cmkKCiAgICAjIFByZXBhcmluZyBlbnRpdHlfcm93czoKICAgIGlmIGVudGl0eV9yb3dzIGlzIG5vdCBOb25lOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJDcmVhdGluZyBEYXRhRnJhbWUgZnJvbSBlbnRpdHlfcm93cyA9IHtlbnRpdHlfcm93c30iKQogICAgICAgIGVudGl0eV9yb3dzID0gZW50aXR5X3Jvd3MuYXNfZGYoKQoKICAgICMgUHJlcGFyaW5nIHRhcmdldDoKICAgIGlmIHRhcmdldDoKICAgICAgICBpZiBpc2luc3RhbmNlKHRhcmdldCwgc3RyKToKICAgICAgICAgICAgdGFyZ2V0ID0ga2luZF90b19kcml2ZXJbdGFyZ2V0XSgpCgogICAgICAgIG5hbWUgPSB0YXJnZXQubmFtZSBpZiBoYXNhdHRyKHRhcmdldCwgIm5hbWUiKSBlbHNlIHRhcmdldFsibmFtZSJdCiAgICA
gICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlByZXBhcmluZyAne25hbWV9JyB0YXJnZXQiKQogICAgICAgIHRhcmdldCA9IGdldF90YXJnZXRfZHJpdmVyKHRhcmdldCkKICAgIGlmIGhhc2F0dHIodGFyZ2V0LCAncGF0aCcpIGFuZCB0YXJnZXQucGF0aDoKICAgICAgICBjb250ZXh0LmxvZ19yZXN1bHQoInRhcmdldCIsIHRhcmdldC5wYXRoKQoKICAgICMgUHJlcGFyaW5nIHJ1bl9jb25maWc6CiAgICBpZiBydW5fY29uZmlnIGFuZCBpc2luc3RhbmNlKHJ1bl9jb25maWcsIGRpY3QpOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIlByZXBhcmluZyBydW4gY29uZmlndXJhdGlvbiIpCiAgICAgICAgcnVuX2NvbmZpZyA9IGZzLlJ1bkNvbmZpZygqKnJ1bl9jb25maWcpCgogICAgIyBDYWxsaW5nIGdldF9vZmZsaW5lX2ZlYXR1cmVzOgogICAgY29udGV4dC5sb2dnZXIuaW5mbygKICAgICAgICBmImdldHRpbmcgb2ZmbGluZSBmZWF0dXJlcyBmcm9tIHRoZSBGZWF0dXJlVmVjdG9yIHtmZWF0dXJlX3ZlY3Rvcn0iCiAgICApCiAgICBmcy5nZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgICAgICBmZWF0dXJlX3ZlY3Rvcj1mZWF0dXJlX3ZlY3RvciwKICAgICAgICBlbnRpdHlfcm93cz1lbnRpdHlfcm93cywKICAgICAgICBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbj1lbnRpdHlfdGltZXN0YW1wX2NvbHVtbiwKICAgICAgICB0YXJnZXQ9dGFyZ2V0LAogICAgICAgIHJ1bl9jb25maWc9cnVuX2NvbmZpZywKICAgICAgICBkcm9wX2NvbHVtbnM9ZHJvcF9jb2x1bW5zLAogICAgICAgIHN0YXJ0X3RpbWU9c3RhcnRfdGltZSwKICAgICAgICBlbmRfdGltZT1lbmRfdGltZSwKICAgICAgICB3aXRoX2luZGV4ZXM9d2l0aF9pbmRleGVzLAogICAgICAgIHVwZGF0ZV9zdGF0cz11cGRhdGVfc3RhdHMsCiAgICApCgogICAgY29udGV4dC5sb2dfcmVzdWx0KCJmZWF0dXJlX3ZlY3RvciIsIGZlYXR1cmVfdmVjdG9yKQo=
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#9899e19f8c98568005dabe064fa29db6d9a47531:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-    origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target.
-
-        If feature vector does not exist, a new one will be created and saved with
-        the given features.
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment)'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri
-        default: ''
-      - name: features
-        type: List[str]
-        doc: Relevant only if feature_vector not exist. list of feature to collect
-          to this vector format [/]. [as
-          ]
-        default: null
-      - name: label_feature
-        type: str
-        doc: feature name to be used as label data
-        default: null
-      - name: description
-        type: str
-        doc: text description of the vector
-        default: null
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 13
-  description: retrieve offline feature vector results
-  default_handler: get_offline_features
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.1/static/item.html b/functions/development/get_offline_features/1.0.1/static/item.html deleted file mode 100644 index 900d7297..00000000 --- a/functions/development/get_offline_features/1.0.1/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-05-25:10-58
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.0.1
-name: get_offline_features
-platformVersion: ''
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.0.1/static/source.html b/functions/development/get_offline_features/1.0.1/static/source.html deleted file mode 100644 index 47b760d0..00000000 --- a/functions/development/get_offline_features/1.0.1/static/source.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: List[str] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features is not None:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        context.logger.info(f"Creating FeatureVector {feature_vector}")
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if hasattr(target, 'path') and target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.1.0/src/function.yaml b/functions/development/get_offline_features/1.1.0/src/function.yaml deleted file mode 100644 index 3482f23c..00000000 --- a/functions/development/get_offline_features/1.1.0/src/function.yaml +++ /dev/null @@ -1,126 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: d8af2b5f1838ec720c9db726ec4f1e96ea82b2b8 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBVbmlvbltMaXN0W3N0cl0sIE5vbmVdID0gTm9uZSwKICAgIGxhYmVsX2ZlYXR1cmU6IHN0ciA9IE5vbmUsCiAgICBkZXNjcmlwdGlvbjogc3RyID0gTm9uZSwKICAgIGVudGl0eV9yb3dzOiBEYXRhSXRlbSA9IE5vbmUsCiAgICBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogc3RyID0gTm9uZSwKICAgIHRhcmdldDogVW5pb25bc3RyLCBEaWN0XSA9IE5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIGRyb3BfY29sdW1uczogTGlzdFtzdHJdID0gTm9uZSwKICAgIHN0YXJ0X3RpbWU6IHN0ciA9IE5vbmUsCiAgICBlbmRfdGltZTogc3RyID0gTm9uZSwKICAgIHdpdGhfaW5kZXhlczogYm9vbCA9IEZhbHNlLAogICAgdXBkYXRlX3N0YXRzOiBib29sID0gRmFsc2UsCik6CiAgICAiIiJyZXRyaWV2ZSBvZmZsaW5lIGZlYXR1cmUgdmVjdG9yIHJlc3VsdHMKCiAgICBzcGVjaWZ5IGEgZmVhdHVyZSB2ZWN0b3Igb2JqZWN0L3VyaSBhbmQgcmV0cmlldm
UgdGhlIGRlc2lyZWQgZmVhdHVyZXMsIHRoZWlyIG1ldGFkYXRhCiAgICBhbmQgc3RhdGlzdGljcy4gcmV0dXJucyA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLk9mZmxpbmVWZWN0b3JSZXNwb25zZWAsCiAgICByZXN1bHRzIGNhbiBiZSByZXR1cm5lZCBhcyBhIGRhdGFmcmFtZSBvciB3cml0dGVuIHRvIGEgdGFyZ2V0LgogICAgSWYgZmVhdHVyZSB2ZWN0b3IgZG9lcyBub3QgZXhpc3QsIGEgbmV3IG9uZSB3aWxsIGJlIGNyZWF0ZWQgYW5kIHNhdmVkIHdpdGggdGhlIGdpdmVuIGZlYXR1cmVzLgoKICAgIFRoZSBzdGFydF90aW1lIGFuZCBlbmRfdGltZSBhdHRyaWJ1dGVzIGFsbG93IGZpbHRlcmluZyB0aGUgZGF0YSB0byBhIGdpdmVuIHRpbWUgcmFuZ2UsIHRoZXkgYWNjZXB0CiAgICBzdHJpbmcgdmFsdWVzIG9yIHBhbmRhcyBgVGltZXN0YW1wYCBvYmplY3RzLCBzdHJpbmcgdmFsdWVzIGNhbiBhbHNvIGJlIHJlbGF0aXZlLCBmb3IgZXhhbXBsZToKICAgICJub3ciLCAibm93IC0gMWQyaCIsICJub3crNW0iLCB3aGVyZSBhIHZhbGlkIHBhbmRhcyBUaW1lZGVsdGEgc3RyaW5nIGZvbGxvd3MgdGhlIHZlcmIgIm5vdyIsCiAgICBmb3IgdGltZSBhbGlnbm1lbnQgeW91IGNhbiB1c2UgdGhlIHZlcmIgImZsb29yIiBlLmcuICJub3cgLTFkIGZsb29yIDFIIiB3aWxsIGFsaWduIHRoZSB0aW1lIHRvIHRoZSBsYXN0IGhvdXIKICAgICh0aGUgZmxvb3Igc3RyaW5nIGlzIHBhc3NlZCB0byBwYW5kYXMuVGltZXN0YW1wLmZsb29yKCksIGNhbiB1c2UgRCwgSCwgVCwgUyBmb3IgZGF5LCBob3VyLCBtaW4sIHNlYyBhbGlnbm1lbnQpCgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgTUxSdW4gY29udGV4dAogICAgOnBhcmFtIGZlYXR1cmVfdmVjdG9yOiBmZWF0dXJlIHZlY3RvciB1cmkKICAgIDpwYXJhbSBmZWF0dXJlczogICAgICAgUmVsZXZhbnQgb25seSBpZiBmZWF0dXJlX3ZlY3RvciBub3QgZXhpc3QuIGxpc3Qgb2YgZmVhdHVyZSB0byBjb2xsZWN0IHRvIHRoaXMgdmVjdG9yCiAgICAgICAgICAgICAgICAgICAgICAgICAgIGZvcm1hdCBbPHByb2plY3Q+L108ZmVhdHVyZV9zZXQ+LjxmZWF0dXJlX25hbWUgb3IgKj4gW2FzIDxhbGlhcz5dCiAgICA6cGFyYW0gbGFiZWxfZmVhdHVyZTogIGZlYXR1cmUgbmFtZSB0byBiZSB1c2VkIGFzIGxhYmVsIGRhdGEKICAgIDpwYXJhbSBkZXNjcmlwdGlvbjogICAgdGV4dCBkZXNjcmlwdGlvbiBvZiB0aGUgdmVjdG9yCiAgICA6cGFyYW0gZW50aXR5X3Jvd3M6ICAgIFVSSSBvZiB0aGUgZGF0YSBlbnRpdHkgcm93cyB0byBqb2luIHdpdGgKICAgIDpwYXJhbSB0YXJnZXQ6ICAgICAgICAgd2hlcmUgdG8gd3JpdGUgdGhlIHJlc3VsdHMgdG8KICAgIDpwYXJhbSBkcm9wX2NvbHVtbnM6ICAgbGlzdCBvZiBjb2x1bW5zIHRvIGRyb3AgZnJvbSB0aGUgZmluYWwgcmVzdWx0CiAgICA6cGFyYW0gZW50aXR5X3RpbWVzdGFtcF9jb2x1bW46IHRpbWVzdGFtcCBjb2x1bW4gbmFtZSBpbiB0aGUgZW50aX
R5IHJvd3MgZGF0YWZyYW1lCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgIGZ1bmN0aW9uIGFuZC9vciBydW4gY29uZmlndXJhdGlvbgogICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3RhcnRfdGltZTogICAgICBkYXRldGltZSwgbG93IGxpbWl0IG9mIHRpbWUgbmVlZGVkIHRvIGJlIGZpbHRlcmVkLiBPcHRpb25hbAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcKICAgIDpwYXJhbSBlbmRfdGltZTogICAgICAgIGRhdGV0aW1lLCBoaWdoIGxpbWl0IG9mIHRpbWUgbmVlZGVkIHRvIGJlIGZpbHRlcmVkLiBPcHRpb25hbAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcKICAgIDpwYXJhbSB3aXRoX2luZGV4ZXM6ICAgIHJldHVybiB2ZWN0b3Igd2l0aCBpbmRleCBjb2x1bW5zIChkZWZhdWx0IEZhbHNlKQogICAgOnBhcmFtIHVwZGF0ZV9zdGF0czogICAgdXBkYXRlIGZlYXR1cmVzIHN0YXRpc3RpY3MgZnJvbSB0aGUgcmVxdWVzdGVkIGZlYXR1cmUgc2V0cyBvbiB0aGUgdmVjdG9yLiBEZWZhdWx0IGlzIEZhbHNlLgoKICAgIDpyZXR1cm5zIGZlYXR1cmVfdmVjdG9yIGlucHV0CiAgICAiIiIKCiAgICBpZiBmZWF0dXJlczoKICAgICAgICAjIENyZWF0aW5nIGEgbmV3IEZlYXR1cmVWZWN0b3IgYW5kIHNhdmluZzoKICAgICAgICBpZiBpc19zdG9yZV91cmkoZmVhdHVyZV92ZWN0b3IpOgogICAgICAgICAgICBwcmVmaXgsIG5ld191cmkgPSBwYXJzZV9zdG9yZV91cmkoZmVhdHVyZV92ZWN0b3IpCiAgICAgICAgICAgIGlmIHByZWZpeCAhPSBTdG9yZVByZWZpeC5GZWF0dXJlVmVjdG9yOgogICAgICAgICAgICAgICAgcmFpc2UgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcigKICAgICAgICAgICAgICAgICAgICBmInByb3ZpZGVkIHN0b3JlIHVyaSAoe2ZlYXR1cmVfdmVjdG9yfSkgZG9lcyBub3QgcmVwcmVzZW50IGEgZmVhdHVyZSB2ZWN0b3IgKHByZWZpeD17cHJlZml4fSkiCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgIGZlYXR1cmVfdmVjdG9yID0gbmV3X3VyaQoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ3JlYXRpbmcgRmVhdHVyZVZlY3RvciB7ZmVhdHVyZV92ZWN0b3J9IikKICAgICAgICBwcm9qZWN0LCBuYW1lLCB0YWcsIF8gPSBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaShmZWF0dXJlX3ZlY3RvciwgbWxydW4ubWxjb25mLmRlZmF1bHRfcHJvamVjdCkKICAgICAgICB2ZWN0b3IgPSBmcy5GZWF0dXJlVmVjdG9yKG5hbWUsIGZlYXR1cmVzLCBsYWJlbF9mZWF0dXJlPWxhYmVsX2ZlYXR1cmUsIGRlc2NyaXB0aW9uPWRlc2NyaXB0aW9uKQogICAgICAgIHZlY3Rvci5tZXRhZGF0YS5wcm9qZWN0ID0gcHJvamVjdAogICAgICAgIHZlY3Rvci5tZX
RhZGF0YS50YWcgPSB0YWcKICAgICAgICB2ZWN0b3Iuc2F2ZSgpCiAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSB2ZWN0b3IudXJpCgogICAgIyBQcmVwYXJpbmcgZW50aXR5X3Jvd3M6CiAgICBpZiBlbnRpdHlfcm93cyBpcyBub3QgTm9uZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ3JlYXRpbmcgRGF0YUZyYW1lIGZyb20gZW50aXR5X3Jvd3MgPSB7ZW50aXR5X3Jvd3N9IikKICAgICAgICBlbnRpdHlfcm93cyA9IGVudGl0eV9yb3dzLmFzX2RmKCkKCiAgICAjIFByZXBhcmluZyB0YXJnZXQ6CiAgICBpZiB0YXJnZXQ6CiAgICAgICAgaWYgaXNpbnN0YW5jZSh0YXJnZXQsIHN0cik6CiAgICAgICAgICAgIHRhcmdldCA9IGtpbmRfdG9fZHJpdmVyW3RhcmdldF0oKQoKICAgICAgICBuYW1lID0gdGFyZ2V0Lm5hbWUgaWYgaGFzYXR0cih0YXJnZXQsICJuYW1lIikgZWxzZSB0YXJnZXRbIm5hbWUiXQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJQcmVwYXJpbmcgJ3tuYW1lfScgdGFyZ2V0IikKICAgICAgICB0YXJnZXQgPSBnZXRfdGFyZ2V0X2RyaXZlcih0YXJnZXQpCiAgICBpZiBoYXNhdHRyKHRhcmdldCwgJ3BhdGgnKSBhbmQgdGFyZ2V0LnBhdGg6CiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCJ0YXJnZXQiLCB0YXJnZXQucGF0aCkKCiAgICAjIFByZXBhcmluZyBydW5fY29uZmlnOgogICAgaWYgcnVuX2NvbmZpZyBhbmQgaXNpbnN0YW5jZShydW5fY29uZmlnLCBkaWN0KToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJQcmVwYXJpbmcgcnVuIGNvbmZpZ3VyYXRpb24iKQogICAgICAgIHJ1bl9jb25maWcgPSBmcy5SdW5Db25maWcoKipydW5fY29uZmlnKQoKICAgICMgQ2FsbGluZyBnZXRfb2ZmbGluZV9mZWF0dXJlczoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJnZXR0aW5nIG9mZmxpbmUgZmVhdHVyZXMgZnJvbSB0aGUgRmVhdHVyZVZlY3RvciB7ZmVhdHVyZV92ZWN0b3J9IgogICAgKQogICAgZnMuZ2V0X29mZmxpbmVfZmVhdHVyZXMoCiAgICAgICAgZmVhdHVyZV92ZWN0b3I9ZmVhdHVyZV92ZWN0b3IsCiAgICAgICAgZW50aXR5X3Jvd3M9ZW50aXR5X3Jvd3MsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW49ZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4sCiAgICAgICAgdGFyZ2V0PXRhcmdldCwKICAgICAgICBydW5fY29uZmlnPXJ1bl9jb25maWcsCiAgICAgICAgZHJvcF9jb2x1bW5zPWRyb3BfY29sdW1ucywKICAgICAgICBzdGFydF90aW1lPXN0YXJ0X3RpbWUsCiAgICAgICAgZW5kX3RpbWU9ZW5kX3RpbWUsCiAgICAgICAgd2l0aF9pbmRleGVzPXdpdGhfaW5kZXhlcywKICAgICAgICB1cGRhdGVfc3RhdHM9dXBkYXRlX3N0YXRzLAogICAgKQoKICAgIGNvbnRleHQubG9nX3Jlc3VsdCgiZmVhdHVyZV92ZWN0b3IiLCBmZWF0dXJlX3ZlY3RvcikK - commands: [] - code_origin: 
https://github.com/mlrun/functions.git#ecc7bc18501d377d10d2b40edccf5c4b256a79d3:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target. - - If feature vector does not exist, a new one will be created and saved with - the given features. - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment)' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: feature_vector - type: str - doc: feature vector uri - default: '' - - name: features - type: Union[List[str], ] - doc: Relevant only if feature_vector not exist. list of feature to collect - to this vector format [/]. 
[as - ] - default: null - - name: label_feature - type: str - doc: feature name to be used as label data - default: null - - name: description - type: str - doc: text description of the vector - default: null - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with - default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 13 - description: retrieve offline feature vector results - default_handler: get_offline_features - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null -verbose: false diff --git a/functions/development/get_offline_features/1.1.0/src/get_offline_features.ipynb b/functions/development/get_offline_features/1.1.0/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/1.1.0/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " 
\"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/1.1.0/src/get_offline_features.py b/functions/development/get_offline_features/1.1.0/src/get_offline_features.py deleted file mode 100644 index ff0d1c04..00000000 --- a/functions/development/get_offline_features/1.1.0/src/get_offline_features.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: Union[List[str], None] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. 
returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) diff --git 
a/functions/development/get_offline_features/1.1.0/src/item.yaml b/functions/development/get_offline_features/1.1.0/src/item.yaml deleted file mode 100644 index 5462aee9..00000000 --- a/functions/development/get_offline_features/1.1.0/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: get_offline_features -platformVersion: 3.5.0 -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/get_offline_features/1.1.0/src/test_get_offline_features.py b/functions/development/get_offline_features/1.1.0/src/test_get_offline_features.py deleted file mode 100644 index 85f07cbc..00000000 --- a/functions/development/get_offline_features/1.1.0/src/test_get_offline_features.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. - """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. 
- """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. 
- """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - 
gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/1.1.0/static/documentation.html b/functions/development/get_offline_features/1.1.0/static/documentation.html deleted file mode 100644 index 3c02dab8..00000000 --- a/functions/development/get_offline_features/1.1.0/static/documentation.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

get_offline_features package

- -
- -
-
-
-
-
-

get_offline_features package#

-
-

Submodules#

-
-
-

get_offline_features.get_offline_features module#

-
-
-get_offline_features.get_offline_features.get_offline_features(context: mlrun.execution.MLClientCtx, feature_vector: str, features: Optional[List[str]] = None, label_feature: Optional[str] = None, description: Optional[str] = None, entity_rows: Optional[mlrun.datastore.base.DataItem] = None, entity_timestamp_column: Optional[str] = None, target: Optional[Union[str, Dict]] = None, run_config: Optional[Union[str, Dict]] = None, drop_columns: Optional[List[str]] = None, start_time: Optional[str] = None, end_time: Optional[str] = None, with_indexes: bool = False, update_stats: bool = False)[source]#
-

retrieve offline feature vector results

-

specify a feature vector object/uri and retrieve the desired features, their metadata -and statistics. returns OfflineVectorResponse, -results can be returned as a dataframe or written to a target. -If feature vector does not exist, a new one will be created and saved with the given features.

-

The start_time and end_time attributes allow filtering the data to a given time range, they accept -string values or pandas Timestamp objects, string values can also be relative, for example: -“now”, “now - 1d2h”, “now+5m”, where a valid pandas Timedelta string follows the verb “now”, -for time alignment you can use the verb “floor” e.g. “now -1d floor 1H” will align the time to the last hour -(the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • feature_vector – feature vector uri

  • -
  • features – Relevant only if feature_vector not exist. list of feature to collect to this vector -format [<project>/]<feature_set>.<feature_name or *> [as <alias>]

  • -
  • label_feature – feature name to be used as label data

  • -
  • description – text description of the vector

  • -
  • entity_rows – URI of the data entity rows to join with

  • -
  • target – where to write the results to

  • -
  • drop_columns – list of columns to drop from the final result

  • -
  • entity_timestamp_column – timestamp column name in the entity rows dataframe

  • -
  • run_config – function and/or run configuration -see RunConfig

  • -
  • start_time – datetime, low limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • end_time – datetime, high limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • with_indexes – return vector with index columns (default False)

  • -
  • update_stats – update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-

:returns feature_vector input

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.1.0/static/example.html b/functions/development/get_offline_features/1.1.0/static/example.html deleted file mode 100644 index bbd67e37..00000000 --- a/functions/development/get_offline_features/1.1.0/static/example.html +++ /dev/null @@ -1,1390 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

get_offline_features() from MLRun FeatureStore#

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example#

-
-

Create Sample Data For Demo#

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)#

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline#

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/01e548158b906df8cc3f4f282097c5f50e603245dd43f2314bc9ff5ef6aa1447.svg
-
-
-
-

Ingest Data Into Offline And Online Stores#

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector#

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()#

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.1.0/static/function.html b/functions/development/get_offline_features/1.1.0/static/function.html deleted file mode 100644 index bb98537b..00000000 --- a/functions/development/get_offline_features/1.1.0/static/function.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: d8af2b5f1838ec720c9db726ec4f1e96ea82b2b8
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZnMKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuc3RvcmVfcmVzb3VyY2VzIGltcG9ydCBpc19zdG9yZV91cmksIHBhcnNlX3N0b3JlX3VyaQpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBnZXRfdGFyZ2V0X2RyaXZlciwga2luZF90b19kcml2ZXIKZnJvbSBtbHJ1bi5kYXRhc3RvcmUuYmFzZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4udXRpbHMgaW1wb3J0IFN0b3JlUHJlZml4LCBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaQpmcm9tIG1scnVuLmVycm9ycyBpbXBvcnQgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcgoKCmRlZiBnZXRfb2ZmbGluZV9mZWF0dXJlcygKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZmVhdHVyZV92ZWN0b3I6IHN0ciwKICAgIGZlYXR1cmVzOiBVbmlvbltMaXN0W3N0cl0sIE5vbmVdID0gTm9uZSwKICAgIGxhYmVsX2ZlYXR1cmU6IHN0ciA9IE5vbmUsCiAgICBkZXNjcmlwdGlvbjogc3RyID0gTm9uZSwKICAgIGVudGl0eV9yb3dzOiBEYXRhSXRlbSA9IE5vbmUsCiAgICBlbnRpdHlfdGltZXN0YW1wX2NvbHVtbjogc3RyID0gTm9uZSwKICAgIHRhcmdldDogVW5pb25bc3RyLCBEaWN0XSA9IE5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIGRyb3BfY29sdW1uczogTGlzdFtzdHJdID0gTm9uZSwKICAgIHN0YXJ0X3RpbWU6IHN0ciA9IE5vbmUsCiAgICBlbmRfdGltZTogc3RyID0gTm9uZSwKICAgIHdpdGhfaW5kZXhlczogYm9vbCA9IEZhbHNlLAogICAgdXBkYXRlX3N0YXRzOiBib29sID0gRmFsc2UsCik6CiAgICAiIiJyZXRyaWV2ZSBvZmZsaW5lIGZlYXR1cmUgdmVjdG9yIHJlc3VsdHMKCiAgICBzcGVjaWZ5IGEgZmVhdHVyZSB2ZWN0b3Igb2JqZWN0L3VyaSBhbmQgcmV0cmlldmUgdGhlIGRlc2lyZWQgZmVhdHVyZXMsIHRoZWlyIG1ldGFkYXRhCiAgICBhbmQgc3RhdGlzdGljcy4gcmV0dXJucyA6cHk6Y2xhc3M6YH5tbHJ1bi5mZWF0dXJlX3N0b3JlLk9mZmxpbmVWZWN0b3JSZXNwb25zZWAsCiAgICByZXN1bHRzIGNhbiBiZSByZXR1cm5lZCBhcyBhIGRhdGFmcmFtZSBvciB3cml0dGVuIHRvIGEgdGFyZ2V0LgogICAgSWYgZmVhdHVyZSB2ZWN0b3IgZG9lcyBub3QgZXhpc3QsIGEgbmV3IG9uZSB3aWxsIGJlIGNyZWF0ZWQgYW5kIHNhdmVkIHdpdGggdGhlIGdpdmVuIGZlYXR1cmVzLgoKICAgIFRoZSBzdGFydF90aW1lIGFuZCBlbmRfdGltZSBhdHRyaWJ1dGVzIGFsbG93IGZpbHRlcmluZyB0aGUgZGF0YSB0byBhIGdpdmVuIHRpbWUgcmFuZ2UsIHRoZXkgYWNjZXB0CiAgICBzdHJpbmcgdmFsdWVzIG9yIHBhbmRhcyBgVGltZXN0YW1wYCBvYmplY3RzLCBzdHJpbmcgdmFsdWVzIGNhbiBhbHN
vIGJlIHJlbGF0aXZlLCBmb3IgZXhhbXBsZToKICAgICJub3ciLCAibm93IC0gMWQyaCIsICJub3crNW0iLCB3aGVyZSBhIHZhbGlkIHBhbmRhcyBUaW1lZGVsdGEgc3RyaW5nIGZvbGxvd3MgdGhlIHZlcmIgIm5vdyIsCiAgICBmb3IgdGltZSBhbGlnbm1lbnQgeW91IGNhbiB1c2UgdGhlIHZlcmIgImZsb29yIiBlLmcuICJub3cgLTFkIGZsb29yIDFIIiB3aWxsIGFsaWduIHRoZSB0aW1lIHRvIHRoZSBsYXN0IGhvdXIKICAgICh0aGUgZmxvb3Igc3RyaW5nIGlzIHBhc3NlZCB0byBwYW5kYXMuVGltZXN0YW1wLmZsb29yKCksIGNhbiB1c2UgRCwgSCwgVCwgUyBmb3IgZGF5LCBob3VyLCBtaW4sIHNlYyBhbGlnbm1lbnQpCgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgTUxSdW4gY29udGV4dAogICAgOnBhcmFtIGZlYXR1cmVfdmVjdG9yOiBmZWF0dXJlIHZlY3RvciB1cmkKICAgIDpwYXJhbSBmZWF0dXJlczogICAgICAgUmVsZXZhbnQgb25seSBpZiBmZWF0dXJlX3ZlY3RvciBub3QgZXhpc3QuIGxpc3Qgb2YgZmVhdHVyZSB0byBjb2xsZWN0IHRvIHRoaXMgdmVjdG9yCiAgICAgICAgICAgICAgICAgICAgICAgICAgIGZvcm1hdCBbPHByb2plY3Q+L108ZmVhdHVyZV9zZXQ+LjxmZWF0dXJlX25hbWUgb3IgKj4gW2FzIDxhbGlhcz5dCiAgICA6cGFyYW0gbGFiZWxfZmVhdHVyZTogIGZlYXR1cmUgbmFtZSB0byBiZSB1c2VkIGFzIGxhYmVsIGRhdGEKICAgIDpwYXJhbSBkZXNjcmlwdGlvbjogICAgdGV4dCBkZXNjcmlwdGlvbiBvZiB0aGUgdmVjdG9yCiAgICA6cGFyYW0gZW50aXR5X3Jvd3M6ICAgIFVSSSBvZiB0aGUgZGF0YSBlbnRpdHkgcm93cyB0byBqb2luIHdpdGgKICAgIDpwYXJhbSB0YXJnZXQ6ICAgICAgICAgd2hlcmUgdG8gd3JpdGUgdGhlIHJlc3VsdHMgdG8KICAgIDpwYXJhbSBkcm9wX2NvbHVtbnM6ICAgbGlzdCBvZiBjb2x1bW5zIHRvIGRyb3AgZnJvbSB0aGUgZmluYWwgcmVzdWx0CiAgICA6cGFyYW0gZW50aXR5X3RpbWVzdGFtcF9jb2x1bW46IHRpbWVzdGFtcCBjb2x1bW4gbmFtZSBpbiB0aGUgZW50aXR5IHJvd3MgZGF0YWZyYW1lCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgIGZ1bmN0aW9uIGFuZC9vciBydW4gY29uZmlndXJhdGlvbgogICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3RhcnRfdGltZTogICAgICBkYXRldGltZSwgbG93IGxpbWl0IG9mIHRpbWUgbmVlZGVkIHRvIGJlIGZpbHRlcmVkLiBPcHRpb25hbAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGltZSBmaWx0ZXJpbmcKICAgIDpwYXJhbSBlbmRfdGltZTogICAgICAgIGRhdGV0aW1lLCBoaWdoIGxpbWl0IG9mIHRpbWUgbmVlZGVkIHRvIGJlIGZpbHRlcmVkLiBPcHRpb25hbAogICAgICAgIGVudGl0eV90aW1lc3RhbXBfY29sdW1uIG11c3QgYmUgcGFzc2VkIHdoZW4gdXNpbmcgdGl
tZSBmaWx0ZXJpbmcKICAgIDpwYXJhbSB3aXRoX2luZGV4ZXM6ICAgIHJldHVybiB2ZWN0b3Igd2l0aCBpbmRleCBjb2x1bW5zIChkZWZhdWx0IEZhbHNlKQogICAgOnBhcmFtIHVwZGF0ZV9zdGF0czogICAgdXBkYXRlIGZlYXR1cmVzIHN0YXRpc3RpY3MgZnJvbSB0aGUgcmVxdWVzdGVkIGZlYXR1cmUgc2V0cyBvbiB0aGUgdmVjdG9yLiBEZWZhdWx0IGlzIEZhbHNlLgoKICAgIDpyZXR1cm5zIGZlYXR1cmVfdmVjdG9yIGlucHV0CiAgICAiIiIKCiAgICBpZiBmZWF0dXJlczoKICAgICAgICAjIENyZWF0aW5nIGEgbmV3IEZlYXR1cmVWZWN0b3IgYW5kIHNhdmluZzoKICAgICAgICBpZiBpc19zdG9yZV91cmkoZmVhdHVyZV92ZWN0b3IpOgogICAgICAgICAgICBwcmVmaXgsIG5ld191cmkgPSBwYXJzZV9zdG9yZV91cmkoZmVhdHVyZV92ZWN0b3IpCiAgICAgICAgICAgIGlmIHByZWZpeCAhPSBTdG9yZVByZWZpeC5GZWF0dXJlVmVjdG9yOgogICAgICAgICAgICAgICAgcmFpc2UgTUxSdW5JbnZhbGlkQXJndW1lbnRFcnJvcigKICAgICAgICAgICAgICAgICAgICBmInByb3ZpZGVkIHN0b3JlIHVyaSAoe2ZlYXR1cmVfdmVjdG9yfSkgZG9lcyBub3QgcmVwcmVzZW50IGEgZmVhdHVyZSB2ZWN0b3IgKHByZWZpeD17cHJlZml4fSkiCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgIGZlYXR1cmVfdmVjdG9yID0gbmV3X3VyaQoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ3JlYXRpbmcgRmVhdHVyZVZlY3RvciB7ZmVhdHVyZV92ZWN0b3J9IikKICAgICAgICBwcm9qZWN0LCBuYW1lLCB0YWcsIF8gPSBwYXJzZV92ZXJzaW9uZWRfb2JqZWN0X3VyaShmZWF0dXJlX3ZlY3RvciwgbWxydW4ubWxjb25mLmRlZmF1bHRfcHJvamVjdCkKICAgICAgICB2ZWN0b3IgPSBmcy5GZWF0dXJlVmVjdG9yKG5hbWUsIGZlYXR1cmVzLCBsYWJlbF9mZWF0dXJlPWxhYmVsX2ZlYXR1cmUsIGRlc2NyaXB0aW9uPWRlc2NyaXB0aW9uKQogICAgICAgIHZlY3Rvci5tZXRhZGF0YS5wcm9qZWN0ID0gcHJvamVjdAogICAgICAgIHZlY3Rvci5tZXRhZGF0YS50YWcgPSB0YWcKICAgICAgICB2ZWN0b3Iuc2F2ZSgpCiAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSB2ZWN0b3IudXJpCgogICAgIyBQcmVwYXJpbmcgZW50aXR5X3Jvd3M6CiAgICBpZiBlbnRpdHlfcm93cyBpcyBub3QgTm9uZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ3JlYXRpbmcgRGF0YUZyYW1lIGZyb20gZW50aXR5X3Jvd3MgPSB7ZW50aXR5X3Jvd3N9IikKICAgICAgICBlbnRpdHlfcm93cyA9IGVudGl0eV9yb3dzLmFzX2RmKCkKCiAgICAjIFByZXBhcmluZyB0YXJnZXQ6CiAgICBpZiB0YXJnZXQ6CiAgICAgICAgaWYgaXNpbnN0YW5jZSh0YXJnZXQsIHN0cik6CiAgICAgICAgICAgIHRhcmdldCA9IGtpbmRfdG9fZHJpdmVyW3RhcmdldF0oKQoKICAgICAgICBuYW1lID0gdGFyZ2V0Lm5hbWUgaWYgaGFzYXR0cih0YXJnZXQsICJuYW1lIikgZWxzZSB0YXJnZXRbIm5hbWUiXQogICA
gICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJQcmVwYXJpbmcgJ3tuYW1lfScgdGFyZ2V0IikKICAgICAgICB0YXJnZXQgPSBnZXRfdGFyZ2V0X2RyaXZlcih0YXJnZXQpCiAgICBpZiBoYXNhdHRyKHRhcmdldCwgJ3BhdGgnKSBhbmQgdGFyZ2V0LnBhdGg6CiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCJ0YXJnZXQiLCB0YXJnZXQucGF0aCkKCiAgICAjIFByZXBhcmluZyBydW5fY29uZmlnOgogICAgaWYgcnVuX2NvbmZpZyBhbmQgaXNpbnN0YW5jZShydW5fY29uZmlnLCBkaWN0KToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJQcmVwYXJpbmcgcnVuIGNvbmZpZ3VyYXRpb24iKQogICAgICAgIHJ1bl9jb25maWcgPSBmcy5SdW5Db25maWcoKipydW5fY29uZmlnKQoKICAgICMgQ2FsbGluZyBnZXRfb2ZmbGluZV9mZWF0dXJlczoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgZiJnZXR0aW5nIG9mZmxpbmUgZmVhdHVyZXMgZnJvbSB0aGUgRmVhdHVyZVZlY3RvciB7ZmVhdHVyZV92ZWN0b3J9IgogICAgKQogICAgZnMuZ2V0X29mZmxpbmVfZmVhdHVyZXMoCiAgICAgICAgZmVhdHVyZV92ZWN0b3I9ZmVhdHVyZV92ZWN0b3IsCiAgICAgICAgZW50aXR5X3Jvd3M9ZW50aXR5X3Jvd3MsCiAgICAgICAgZW50aXR5X3RpbWVzdGFtcF9jb2x1bW49ZW50aXR5X3RpbWVzdGFtcF9jb2x1bW4sCiAgICAgICAgdGFyZ2V0PXRhcmdldCwKICAgICAgICBydW5fY29uZmlnPXJ1bl9jb25maWcsCiAgICAgICAgZHJvcF9jb2x1bW5zPWRyb3BfY29sdW1ucywKICAgICAgICBzdGFydF90aW1lPXN0YXJ0X3RpbWUsCiAgICAgICAgZW5kX3RpbWU9ZW5kX3RpbWUsCiAgICAgICAgd2l0aF9pbmRleGVzPXdpdGhfaW5kZXhlcywKICAgICAgICB1cGRhdGVfc3RhdHM9dXBkYXRlX3N0YXRzLAogICAgKQoKICAgIGNvbnRleHQubG9nX3Jlc3VsdCgiZmVhdHVyZV92ZWN0b3IiLCBmZWF0dXJlX3ZlY3RvcikK
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#ecc7bc18501d377d10d2b40edccf5c4b256a79d3:/Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-    origin_filename: /Users/yonatanshelach/yoni/projects/functions/get_offline_features/get_offline_features.py
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target.
-
-        If feature vector does not exist, a new one will be created and saved with
-        the given features.
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment)'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri
-        default: ''
-      - name: features
-        type: Union[List[str], ]
-        doc: Relevant only if feature_vector not exist. list of feature to collect
-          to this vector format [/]. [as
-          ]
-        default: null
-      - name: label_feature
-        type: str
-        doc: feature name to be used as label data
-        default: null
-      - name: description
-        type: str
-        doc: text description of the vector
-        default: null
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 13
-  description: retrieve offline feature vector results
-  default_handler: get_offline_features
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.1.0/static/get_offline_features.html b/functions/development/get_offline_features/1.1.0/static/get_offline_features.html deleted file mode 100644 index 68c11cf2..00000000 --- a/functions/development/get_offline_features/1.1.0/static/get_offline_features.html +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - -get_offline_features.get_offline_features - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for get_offline_features.get_offline_features

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-
[docs]def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: Union[List[str], None] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. 
list of feature to collect to this vector - format [<project>/]<feature_set>.<feature_name or *> [as <alias>] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.1.0/static/item.html b/functions/development/get_offline_features/1.1.0/static/item.html deleted file mode 100644 index 19eae3eb..00000000 --- a/functions/development/get_offline_features/1.1.0/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: get_offline_features
-platformVersion: 3.5.0
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.1.0/static/source.html b/functions/development/get_offline_features/1.1.0/static/source.html deleted file mode 100644 index e7c6675a..00000000 --- a/functions/development/get_offline_features/1.1.0/static/source.html +++ /dev/null @@ -1,157 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: Union[List[str], None] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        context.logger.info(f"Creating FeatureVector {feature_vector}")
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if hasattr(target, 'path') and target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.2.0/src/function.yaml b/functions/development/get_offline_features/1.2.0/src/function.yaml deleted file mode 100644 index 3c6a8a87..00000000 --- a/functions/development/get_offline_features/1.2.0/src/function.yaml +++ /dev/null @@ -1,127 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: 5ac6c4e2b67440b464710c072708a3581125c2f8 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode:  - commands: [] - code_origin: https://github.com/yonishelach/functions.git#82aab1724569d20c73cca114beb2fac0821d3383:/Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py - origin_filename: /Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target. - - If feature vector does not exist, a new one will be created and saved with - the given features. - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment)' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: feature_vector - type: str - doc: feature vector uri - default: '' - - name: features - type: Union[List[str], ] - doc: Relevant only if feature_vector not exist. list of feature to collect - to this vector format [/]. [as - ] - default: null - - name: label_feature - type: str - doc: feature name to be used as label data - default: null - - name: description - type: str - doc: text description of the vector - default: null - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with - default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 27 - description: retrieve offline feature vector results - default_handler: get_offline_features - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/get_offline_features/1.2.0/src/get_offline_features.ipynb b/functions/development/get_offline_features/1.2.0/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/1.2.0/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. 
Default is False." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " 
\"GOOG\",\n", - " \"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/1.2.0/src/get_offline_features.py b/functions/development/get_offline_features/1.2.0/src/get_offline_features.py deleted file mode 100644 index a48faa9c..00000000 --- a/functions/development/get_offline_features/1.2.0/src/get_offline_features.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: Union[List[str], None] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. 
returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector_uri = vector.uri - else: - if is_store_uri(feature_vector): - feature_vector_uri = feature_vector - else: - vector = fs.get_feature_vector(feature_vector) - feature_vector_uri = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector_uri, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - 
start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) - context.log_result("feature_vector_uri", feature_vector_uri) diff --git a/functions/development/get_offline_features/1.2.0/src/item.yaml b/functions/development/get_offline_features/1.2.0/src/item.yaml deleted file mode 100644 index 17241f6e..00000000 --- a/functions/development/get_offline_features/1.2.0/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: get_offline_features -platformVersion: 3.5.0 -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.2.0 diff --git a/functions/development/get_offline_features/1.2.0/src/test_get_offline_features.py b/functions/development/get_offline_features/1.2.0/src/test_get_offline_features.py deleted file mode 100644 index 21913e01..00000000 --- a/functions/development/get_offline_features/1.2.0/src/test_get_offline_features.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. - """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. 
- """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. 
- """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - 
gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert mlrun.datastore.is_store_uri(gof_run.outputs["feature_vector_uri"]) - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/1.2.0/static/documentation.html b/functions/development/get_offline_features/1.2.0/static/documentation.html deleted file mode 100644 index 3c02dab8..00000000 --- a/functions/development/get_offline_features/1.2.0/static/documentation.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

get_offline_features package

- -
- -
-
-
-
-
-

get_offline_features package#

-
-

Submodules#

-
-
-

get_offline_features.get_offline_features module#

-
-
-get_offline_features.get_offline_features.get_offline_features(context: mlrun.execution.MLClientCtx, feature_vector: str, features: Optional[List[str]] = None, label_feature: Optional[str] = None, description: Optional[str] = None, entity_rows: Optional[mlrun.datastore.base.DataItem] = None, entity_timestamp_column: Optional[str] = None, target: Optional[Union[str, Dict]] = None, run_config: Optional[Union[str, Dict]] = None, drop_columns: Optional[List[str]] = None, start_time: Optional[str] = None, end_time: Optional[str] = None, with_indexes: bool = False, update_stats: bool = False)[source]#
-

retrieve offline feature vector results

-

specify a feature vector object/uri and retrieve the desired features, their metadata -and statistics. returns OfflineVectorResponse, -results can be returned as a dataframe or written to a target. -If feature vector does not exist, a new one will be created and saved with the given features.

-

The start_time and end_time attributes allow filtering the data to a given time range, they accept -string values or pandas Timestamp objects, string values can also be relative, for example: -“now”, “now - 1d2h”, “now+5m”, where a valid pandas Timedelta string follows the verb “now”, -for time alignment you can use the verb “floor” e.g. “now -1d floor 1H” will align the time to the last hour -(the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • feature_vector – feature vector uri

  • -
  • features – Relevant only if feature_vector not exist. list of feature to collect to this vector -format [<project>/]<feature_set>.<feature_name or *> [as <alias>]

  • -
  • label_feature – feature name to be used as label data

  • -
  • description – text description of the vector

  • -
  • entity_rows – URI of the data entity rows to join with

  • -
  • target – where to write the results to

  • -
  • drop_columns – list of columns to drop from the final result

  • -
  • entity_timestamp_column – timestamp column name in the entity rows dataframe

  • -
  • run_config – function and/or run configuration -see RunConfig

  • -
  • start_time – datetime, low limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • end_time – datetime, high limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • with_indexes – return vector with index columns (default False)

  • -
  • update_stats – update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-

:returns feature_vector input

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.2.0/static/example.html b/functions/development/get_offline_features/1.2.0/static/example.html deleted file mode 100644 index bbd67e37..00000000 --- a/functions/development/get_offline_features/1.2.0/static/example.html +++ /dev/null @@ -1,1390 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

get_offline_features() from MLRun FeatureStore#

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example#

-
-

Create Sample Data For Demo#

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)#

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline#

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/01e548158b906df8cc3f4f282097c5f50e603245dd43f2314bc9ff5ef6aa1447.svg
-
-
-
-

Ingest Data Into Offline And Online Stores#

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector#

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()#

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.2.0/static/function.html b/functions/development/get_offline_features/1.2.0/static/function.html deleted file mode 100644 index efdf38c0..00000000 --- a/functions/development/get_offline_features/1.2.0/static/function.html +++ /dev/null @@ -1,149 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: 5ac6c4e2b67440b464710c072708a3581125c2f8
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: 
-    commands: []
-    code_origin: https://github.com/yonishelach/functions.git#82aab1724569d20c73cca114beb2fac0821d3383:/Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py
-    origin_filename: /Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target.
-
-        If feature vector does not exist, a new one will be created and saved with
-        the given features.
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment)'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri
-        default: ''
-      - name: features
-        type: Union[List[str], ]
-        doc: Relevant only if feature_vector not exist. list of feature to collect
-          to this vector format [/]. [as
-          ]
-        default: null
-      - name: label_feature
-        type: str
-        doc: feature name to be used as label data
-        default: null
-      - name: description
-        type: str
-        doc: text description of the vector
-        default: null
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 27
-  description: retrieve offline feature vector results
-  default_handler: get_offline_features
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.2.0/static/get_offline_features.html b/functions/development/get_offline_features/1.2.0/static/get_offline_features.html deleted file mode 100644 index f6c610d6..00000000 --- a/functions/development/get_offline_features/1.2.0/static/get_offline_features.html +++ /dev/null @@ -1,282 +0,0 @@ - - - - - - - -get_offline_features.get_offline_features - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for get_offline_features.get_offline_features

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-
[docs]def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: Union[List[str], None] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. 
list of feature to collect to this vector - format [<project>/]<feature_set>.<feature_name or *> [as <alias>] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector_uri = vector.uri - else: - if is_store_uri(feature_vector): - feature_vector_uri = feature_vector - else: - vector = fs.get_feature_vector(feature_vector) - feature_vector_uri = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector_uri, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - 
start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) - context.log_result("feature_vector_uri", feature_vector_uri)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.2.0/static/item.html b/functions/development/get_offline_features/1.2.0/static/item.html deleted file mode 100644 index 97bb495e..00000000 --- a/functions/development/get_offline_features/1.2.0/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: get_offline_features
-platformVersion: 3.5.0
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.2.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/1.2.0/static/source.html b/functions/development/get_offline_features/1.2.0/static/source.html deleted file mode 100644 index ab2901f3..00000000 --- a/functions/development/get_offline_features/1.2.0/static/source.html +++ /dev/null @@ -1,164 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: Union[List[str], None] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        context.logger.info(f"Creating FeatureVector {feature_vector}")
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector_uri = vector.uri
-    else:
-        if is_store_uri(feature_vector):
-            feature_vector_uri = feature_vector
-        else:
-            vector = fs.get_feature_vector(feature_vector)
-            feature_vector_uri = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if hasattr(target, 'path') and target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector_uri,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-    context.log_result("feature_vector_uri", feature_vector_uri)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/latest/src/function.yaml b/functions/development/get_offline_features/latest/src/function.yaml deleted file mode 100644 index 3c6a8a87..00000000 --- a/functions/development/get_offline_features/latest/src/function.yaml +++ /dev/null @@ -1,127 +0,0 @@ -kind: job -metadata: - name: get-offline-features - tag: '' - hash: 5ac6c4e2b67440b464710c072708a3581125c2f8 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode:  - commands: [] - code_origin: https://github.com/yonishelach/functions.git#82aab1724569d20c73cca114beb2fac0821d3383:/Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py - origin_filename: /Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py - entry_points: - get_offline_features: - name: get_offline_features - doc: 'retrieve offline feature vector results - - - specify a feature vector object/uri and retrieve the desired features, their - metadata - - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - - results can be returned as a dataframe or written to a target. - - If feature vector does not exist, a new one will be created and saved with - the given features. - - - The start_time and end_time attributes allow filtering the data to a given - time range, they accept - - string values or pandas `Timestamp` objects, string values can also be relative, - for example: - - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows - the verb "now", - - for time alignment you can use the verb "floor" e.g. 
"now -1d floor 1H" will - align the time to the last hour - - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, - S for day, hour, min, sec alignment)' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: feature_vector - type: str - doc: feature vector uri - default: '' - - name: features - type: Union[List[str], ] - doc: Relevant only if feature_vector not exist. list of feature to collect - to this vector format [/]. [as - ] - default: null - - name: label_feature - type: str - doc: feature name to be used as label data - default: null - - name: description - type: str - doc: text description of the vector - default: null - - name: entity_rows - type: DataItem - doc: URI of the data entity rows to join with - default: null - - name: entity_timestamp_column - type: str - doc: timestamp column name in the entity rows dataframe - default: null - - name: target - type: Union[str, Dict] - doc: where to write the results to - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: drop_columns - type: List[str] - doc: list of columns to drop from the final result - default: null - - name: start_time - type: str - doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: end_time - type: str - doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column - must be passed when using time filtering - default: null - - name: with_indexes - type: bool - doc: return vector with index columns (default False) - default: false - - name: update_stats - type: bool - doc: update features statistics from the requested feature sets on the vector. - Default is False. 
- default: false - outputs: - - default: '' - lineno: 27 - description: retrieve offline feature vector results - default_handler: get_offline_features - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/get_offline_features/latest/src/get_offline_features.ipynb b/functions/development/get_offline_features/latest/src/get_offline_features.ipynb deleted file mode 100644 index d97402a2..00000000 --- a/functions/development/get_offline_features/latest/src/get_offline_features.ipynb +++ /dev/null @@ -1,1536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `get_offline_features()` from MLRun FeatureStore\n", - "\n", - "This MLRun Function has the following `params`:\n", - "\n", - "- `feature_vector: str`, feature vector uri.\n", - "\n", - "- `entity_rows: DataItem` = None, URI of the data entity rows to join with.\n", - "\n", - "- `entity_timestamp_column: str = None`, timestamp column name in the entity rows dataframe.\n", - "\n", - "- `target: Union[str, Dict] = None`, where to write the results to.\n", - "\n", - "- `run_config: Union[str, Dict] = None`, function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`.\n", - "\n", - "- `drop_columns: List[str] = None`, list of columns to drop from the final result. \n", - "\n", - "- `start_time: str = None`, datetime, low limit of time needed to be filtered. Optional. `entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `end_time: str = None`, datetime, high limit of time needed to be filtered. Optional. 
`entity_timestamp_column` must be passed when using time filtering.\n", - "\n", - "- `with_indexes: bool = False`, return vector with index columns (default False).\n", - "\n", - "- `update_stats: bool = False`, update features statistics from the requested feature sets on the vector. Default is False." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import CSVTarget\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.run import get_dataitem\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB\n" - ] - } - ], - "source": [ - "ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])\n", - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('get-offline-features', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " 
pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", - " ],\n", - " \"ticker\": [\n", - " \"GOOG\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"MSFT\",\n", - " \"GOOG\",\n", - " \"AAPL\",\n", - " \"GOOG\",\n", - " \"MSFT\"\n", - " ],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", - " }\n", - ")\n", - "\n", - "trades = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", - " ],\n", - " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", - " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", - " \"quantity\": [75, 155, 100, 100, 100]\n", - " }\n", - ")\n", - "\n", - "stocks = pd.DataFrame(\n", - " {\n", - " \"ticker\": [\"MSFT\", \"GOOG\", \"AAPL\"],\n", - " \"name\": [\"Microsoft Corporation\", \"Alphabet Inc\", \"Apple Inc\"],\n", - " \"exchange\": [\"NASDAQ\", \"NASDAQ\", \"NASDAQ\"]\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def move_date(df, col):\n", - " max_date = df[col].max()\n", - " now_date = datetime.datetime.now()\n", - " delta = now_date - max_date \n", - " df[col] = df[col] + delta \n", - " return df\n", - "\n", - "quotes = move_date(quotes, \"time\")\n", - "trades = move_date(trades, \"time\")\n", - "trades.to_csv('trades.csv', index=False)\n", - "data_uri = os.path.join(ABS_PATH, 'trades.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 14:41:48.260566 GOOG 720.50 720.93\n", - "1 2022-01-31 14:41:48.260566 MSFT 51.95 51.96\n", - "2 2022-01-31 14:41:48.267566 MSFT 51.97 51.98\n", - "3 2022-01-31 14:41:48.278566 MSFT 51.99 52.00\n", - "4 2022-01-31 14:41:48.285566 GOOG 720.50 720.93\n", - "5 2022-01-31 14:41:48.286566 AAPL 97.99 98.01\n", - "6 2022-01-31 14:41:48.309566 GOOG 720.50 720.88\n", - "7 2022-01-31 14:41:48.312566 MSFT 52.01 52.03" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
\n", - "
" - ], - "text/plain": [ - " time ticker price quantity\n", - "0 2022-01-31 14:41:48.288476 MSFT 51.95 75\n", - "1 2022-01-31 14:41:48.303476 MSFT 51.95 155\n", - "2 2022-01-31 14:41:48.313476 GOOG 720.77 100\n", - "3 2022-01-31 14:41:48.313476 GOOG 720.92 100\n", - "4 2022-01-31 14:41:48.313476 AAPL 98.00 100" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trades" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " ticker name exchange\n", - "0 MSFT Microsoft Corporation NASDAQ\n", - "1 GOOG Alphabet Inc NASDAQ\n", - "2 AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stocks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build & Ingest Simple Feature Set (stocks)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " name exchange\n", - "ticker \n", - "MSFT Microsoft Corporation NASDAQ\n", - "GOOG Alphabet Inc NASDAQ\n", - "AAPL Apple Inc NASDAQ" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add feature set without time column (stock ticker metadata) \n", - "stocks_set = fstore.FeatureSet(\"stocks\", entities=[fstore.Entity(\"ticker\")])\n", - "fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build Advanced feature set - with feature engineering pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "MyMap\n", - "\n", - "MyMap\n", - "\n", - "\n", - "\n", - "_start->MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - "storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - 
"filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set.graph.to(\"MyMap\", multiplier=3)\\\n", - " .to(\"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\")\\\n", - " .to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\")\\\n", - " .to(FeaturesetValidator())\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(validator=MinMaxValidator(min=52, severity=\"info\"))\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingest Data Into Offline And Online Stores" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
\n", - "
" - ], - "text/plain": [ - " asks1_max_1h asks1_sum_1h asks5_max_5h asks5_sum_5h bids_max_1h \\\n", - "ticker \n", - "GOOG 720.93 720.93 720.93 720.93 720.50 \n", - "MSFT 51.96 51.96 51.96 51.96 51.95 \n", - "MSFT 51.98 103.94 51.98 103.94 51.97 \n", - "MSFT 52.00 155.94 52.00 155.94 51.99 \n", - "GOOG 720.93 1441.86 720.93 1441.86 720.50 \n", - "AAPL 98.01 98.01 98.01 98.01 97.99 \n", - "GOOG 720.93 2162.74 720.93 2162.74 720.50 \n", - "MSFT 52.03 207.97 52.03 207.97 52.01 \n", - "\n", - " bids_min_1h time bid ask multi \\\n", - "ticker \n", - "GOOG 720.50 2022-01-31 14:41:48.260566 720.50 720.93 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.260566 51.95 51.96 155.85 \n", - "MSFT 51.95 2022-01-31 14:41:48.267566 51.97 51.98 155.91 \n", - "MSFT 51.95 2022-01-31 14:41:48.278566 51.99 52.00 155.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.285566 720.50 720.93 2161.50 \n", - "AAPL 97.99 2022-01-31 14:41:48.286566 97.99 98.01 293.97 \n", - "GOOG 720.50 2022-01-31 14:41:48.309566 720.50 720.88 2161.50 \n", - "MSFT 51.95 2022-01-31 14:41:48.312566 52.01 52.03 156.03 \n", - "\n", - " extra \n", - "ticker \n", - "GOOG 55478.50 \n", - "MSFT 4000.15 \n", - "MSFT 4001.69 \n", - "MSFT 4003.23 \n", - "GOOG 55478.50 \n", - "AAPL 7545.23 \n", - "GOOG 55478.50 \n", - "MSFT 4004.77 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# save ingest data and print the FeatureSet spec\n", - "fstore.ingest(quotes_set, quotes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get an Offline Feature Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "features = [\n", - " \"stock-quotes.multi\",\n", - " \"stock-quotes.asks5_sum_5h as total_ask\",\n", - " \"stock-quotes.bids_min_1h\",\n", - " \"stock-quotes.bids_max_1h\",\n", - " \"stocks.*\",\n", - "]\n", - "\n", - "vector = fstore.FeatureVector(\"stocks-vec\", features)\n", - 
"vector.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using `get_offline_features()` " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080\n", - "> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv\n", - "> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target\n", - "> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec\n", - "> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 14:41:52,896 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "gof_run = get_offline_features_fn.run(\n", - " handler='get_offline_features',\n", - " inputs= {'entity_rows': data_uri},\n", - " params={'feature_vector': vector.uri,\n", - " 'target': target_dict,\n", - " 'entity_timestamp_column': \"time\",\n", - " },\n", - " local=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'store://feature-vectors/get-offline-features-yonatan/stocks-vec'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gof_run.outputs['feature_vector']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 price quantity multi total_ask bids_min_1h \\\n", - "0 0 51.95 75 155.85 51.96 51.95 \n", - "1 1 51.95 75 155.91 103.94 51.95 \n", - "2 2 51.95 75 155.97 155.94 51.95 \n", - "3 3 51.95 75 156.03 207.97 51.95 \n", - "4 4 51.95 155 155.85 51.96 51.95 \n", - "5 5 51.95 155 155.91 103.94 51.95 \n", - "6 6 51.95 155 155.97 155.94 51.95 \n", - "7 7 51.95 155 156.03 207.97 51.95 \n", - "8 8 720.77 100 2161.50 720.93 720.50 \n", - "9 9 720.77 100 2161.50 1441.86 720.50 \n", - "10 10 720.77 100 2161.50 2162.74 720.50 \n", - "11 11 720.92 100 2161.50 720.93 720.50 \n", - "12 12 720.92 100 2161.50 1441.86 720.50 \n", - "13 13 720.92 100 2161.50 2162.74 720.50 \n", - "14 14 98.00 100 293.97 98.01 97.99 \n", - "\n", - " bids_max_1h name exchange \n", - "0 51.95 Microsoft Corporation NASDAQ \n", - "1 51.97 Microsoft Corporation NASDAQ \n", - "2 51.99 Microsoft Corporation NASDAQ \n", - "3 52.01 Microsoft Corporation NASDAQ \n", - "4 51.95 Microsoft Corporation NASDAQ \n", - "5 51.97 Microsoft Corporation NASDAQ \n", - "6 51.99 Microsoft Corporation NASDAQ \n", - "7 52.01 Microsoft Corporation NASDAQ \n", - "8 720.50 Alphabet Inc NASDAQ \n", - "9 720.50 Alphabet Inc NASDAQ \n", - "10 720.50 Alphabet Inc NASDAQ \n", - "11 720.50 Alphabet Inc NASDAQ \n", - "12 720.50 Alphabet Inc NASDAQ \n", - "13 720.50 Alphabet Inc NASDAQ \n", - "14 97.99 Apple Inc NASDAQ " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/get_offline_features/latest/src/get_offline_features.py b/functions/development/get_offline_features/latest/src/get_offline_features.py deleted file mode 100644 index a48faa9c..00000000 --- a/functions/development/get_offline_features/latest/src/get_offline_features.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import Union, List, Dict - -import mlrun -import mlrun.feature_store as fs -from mlrun.datastore.store_resources import is_store_uri, parse_store_uri -from mlrun.datastore.targets import get_target_driver, kind_to_driver -from mlrun.datastore.base import DataItem -from mlrun.execution import MLClientCtx -from mlrun.utils import StorePrefix, parse_versioned_object_uri -from mlrun.errors import MLRunInvalidArgumentError - - -def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: Union[List[str], None] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and 
statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. list of feature to collect to this vector - format [/]. [as ] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector_uri = vector.uri - else: - if is_store_uri(feature_vector): - feature_vector_uri = feature_vector - else: - vector = fs.get_feature_vector(feature_vector) - feature_vector_uri = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector_uri, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - 
start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) - context.log_result("feature_vector_uri", feature_vector_uri) diff --git a/functions/development/get_offline_features/latest/src/item.yaml b/functions/development/get_offline_features/latest/src/item.yaml deleted file mode 100644 index 17241f6e..00000000 --- a/functions/development/get_offline_features/latest/src/item.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: retrieve offline feature vector results -doc: '' -example: get_offline_features.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: get_offline_features -platformVersion: 3.5.0 -spec: - filename: get_offline_features.py - handler: get_offline_features - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.2.0 diff --git a/functions/development/get_offline_features/latest/src/test_get_offline_features.py b/functions/development/get_offline_features/latest/src/test_get_offline_features.py deleted file mode 100644 index 21913e01..00000000 --- a/functions/development/get_offline_features/latest/src/test_get_offline_features.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import tempfile -import shutil -import datetime - -import pytest -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.targets import CSVTarget -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -from mlrun.run import get_dataitem - - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. - """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - """ - Creating project and temp dir for the project. - """ - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.get_or_create_project( - "get-offline-features-test", context="./", user_project=True - ) - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): - """ - Creates all the necessary DataFrames to the test. 
- """ - - def move_date(df, col): - max_date = df[col].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - df[col] = df[col] + delta - return df - - stocks = pd.DataFrame( - { - "ticker": ["MSFT", "GOOG", "AAPL"], - "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"], - "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"], - } - ) - - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - trades = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.038"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.048"), - ], - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.0], - "quantity": [75, 155, 100, 100, 100], - } - ) - quotes = move_date(quotes, "time") - trades = move_date(trades, "time") - return quotes, trades, stocks - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - """ - Creating all the necessary FeatureSets for the test. 
- """ - stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")]) - - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions and plot - quotes_set.set_targets() - return quotes_set, stocks_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_get_offline_vector(): - # Creating project: - artifact_path, project = _set_environment() - - # Importing the marketplace function: - gof_fn = mlrun.import_function("function.yaml") - - # Creating the dataframes: - quotes, trades, stocks = create_dataframes() - - # Defining features for the FeatureVector: - features = [ - "stock-quotes.multi", - "stock-quotes.asks5_sum_5h as total_ask", - "stock-quotes.bids_min_1h", - "stock-quotes.bids_max_1h", - "stocks.*", - ] - - # Creating the FeatureSets and ingesting them: - quotes_set, stocks_set = _create_feature_set() - fstore.ingest(stocks_set, stocks) - fstore.ingest(quotes_set, quotes) - - # Saving the trades dataframe as a csv to use as entity_rows: - trades_uri = os.path.join(artifact_path, "trades.csv") - trades.to_csv(trades_uri, index=False) - - # Creating target for the FeatureVector: - target_dict = CSVTarget( - "mycsv", path=os.path.join(artifact_path, "my_csv.csv") - ).to_dict() - - # Running the getting_offline_features function: - 
gof_run = None - try: - gof_run = gof_fn.run( - handler="get_offline_features", - inputs={"entity_rows": trades_uri}, - params={ - "feature_vector": "stocks-vec", - "features": features, - "target": target_dict, - "entity_timestamp_column": "time", - }, - local=True, - ) - - except Exception as e: - print(f"- The test failed - raised the following error:\n- {e}") - - target_df = get_dataitem(gof_run.outputs["target"]).as_df() - vector_df = get_dataitem(gof_run.outputs["feature_vector"]).as_df() - - # Asserting that the target and FeatureVector dataframes are the same: - assert mlrun.datastore.is_store_uri(gof_run.outputs["feature_vector_uri"]) - assert vector_df.equals(target_df), "Target and feature vector are not the same" - _cleanup_environment(artifact_path) diff --git a/functions/development/get_offline_features/latest/static/documentation.html b/functions/development/get_offline_features/latest/static/documentation.html deleted file mode 100644 index 3c02dab8..00000000 --- a/functions/development/get_offline_features/latest/static/documentation.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - -get_offline_features package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

get_offline_features package

- -
- -
-
-
-
-
-

get_offline_features package#

-
-

Submodules#

-
-
-

get_offline_features.get_offline_features module#

-
-
-get_offline_features.get_offline_features.get_offline_features(context: mlrun.execution.MLClientCtx, feature_vector: str, features: Optional[List[str]] = None, label_feature: Optional[str] = None, description: Optional[str] = None, entity_rows: Optional[mlrun.datastore.base.DataItem] = None, entity_timestamp_column: Optional[str] = None, target: Optional[Union[str, Dict]] = None, run_config: Optional[Union[str, Dict]] = None, drop_columns: Optional[List[str]] = None, start_time: Optional[str] = None, end_time: Optional[str] = None, with_indexes: bool = False, update_stats: bool = False)[source]#
-

retrieve offline feature vector results

-

specify a feature vector object/uri and retrieve the desired features, their metadata -and statistics. returns OfflineVectorResponse, -results can be returned as a dataframe or written to a target. -If feature vector does not exist, a new one will be created and saved with the given features.

-

The start_time and end_time attributes allow filtering the data to a given time range, they accept -string values or pandas Timestamp objects, string values can also be relative, for example: -“now”, “now - 1d2h”, “now+5m”, where a valid pandas Timedelta string follows the verb “now”, -for time alignment you can use the verb “floor” e.g. “now -1d floor 1H” will align the time to the last hour -(the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • feature_vector – feature vector uri

  • -
  • features – Relevant only if feature_vector not exist. list of feature to collect to this vector -format [<project>/]<feature_set>.<feature_name or *> [as <alias>]

  • -
  • label_feature – feature name to be used as label data

  • -
  • description – text description of the vector

  • -
  • entity_rows – URI of the data entity rows to join with

  • -
  • target – where to write the results to

  • -
  • drop_columns – list of columns to drop from the final result

  • -
  • entity_timestamp_column – timestamp column name in the entity rows dataframe

  • -
  • run_config – function and/or run configuration -see RunConfig

  • -
  • start_time – datetime, low limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • end_time – datetime, high limit of time needed to be filtered. Optional -entity_timestamp_column must be passed when using time filtering

  • -
  • with_indexes – return vector with index columns (default False)

  • -
  • update_stats – update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-

:returns feature_vector input

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/latest/static/example.html b/functions/development/get_offline_features/latest/static/example.html deleted file mode 100644 index bbd67e37..00000000 --- a/functions/development/get_offline_features/latest/static/example.html +++ /dev/null @@ -1,1390 +0,0 @@ - - - - - - - -get_offline_features() from MLRun FeatureStore - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

get_offline_features() from MLRun FeatureStore#

-

This MLRun Function has the following params:

-
    -
  • feature_vector: str, feature vector uri.

  • -
  • entity_rows: DataItem = None, URI of the data entity rows to join with.

  • -
  • entity_timestamp_column: str = None, timestamp column name in the entity rows dataframe.

  • -
  • target: Union[str, Dict] = None, where to write the results to.

  • -
  • run_config: Union[str, Dict] = None, function and/or run configuration see :py:class:~mlrun.feature_store.RunConfig.

  • -
  • drop_columns: List[str] = None, list of columns to drop from the final result.

  • -
  • start_time: str = None, datetime, low limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • end_time: str = None, datetime, high limit of time needed to be filtered. Optional. entity_timestamp_column must be passed when using time filtering.

  • -
  • with_indexes: bool = False, return vector with index columns (default False).

  • -
  • update_stats: bool = False, update features statistics from the requested feature sets on the vector. Default is False.

  • -
-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from mlrun.datastore.targets import CSVTarget
-from mlrun.datastore.sources import CSVSource
-from mlrun.run import get_dataitem
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-import os
-
-
-
-
-
-
-
ABS_PATH = 'v3io://users/{}/get_offline_features/'.format(os.environ['V3IO_USERNAME'])
-# Initialize the MLRun project object
-project = mlrun.get_or_create_project('get-offline-features', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 14:41:48,288 [info] loaded project get-offline-features from MLRun DB
-
-
-
-
-
-

Generating the Same FeatureSets and FeatureVecotrs Based on the Stocks Example#

-
-

Create Sample Data For Demo#

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075")
-        ],
-        "ticker": [
-               "GOOG",
-               "MSFT",
-               "MSFT",
-               "MSFT",
-               "GOOG",
-               "AAPL",
-               "GOOG",
-               "MSFT"
-           ],
-           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
-    }
-)
-
-trades = pd.DataFrame(
-       {
-           "time": [
-               pd.Timestamp("2016-05-25 13:30:00.023"),
-               pd.Timestamp("2016-05-25 13:30:00.038"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048"),
-               pd.Timestamp("2016-05-25 13:30:00.048")
-           ],
-           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
-           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
-           "quantity": [75, 155, 100, 100, 100]
-       }
-)
-
-stocks = pd.DataFrame(
-       {
-           "ticker": ["MSFT", "GOOG", "AAPL"],
-           "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
-           "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"]
-       }
-)
-
-
-
-
-
-
-
def move_date(df, col):
-    max_date = df[col].max()
-    now_date = datetime.datetime.now()
-    delta = now_date - max_date 
-    df[col] = df[col] + delta 
-    return df
-
-quotes = move_date(quotes, "time")
-trades = move_date(trades, "time")
-trades.to_csv('trades.csv', index=False)
-data_uri = os.path.join(ABS_PATH, 'trades.csv')
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 14:41:48.260566GOOG720.50720.93
12022-01-31 14:41:48.260566MSFT51.9551.96
22022-01-31 14:41:48.267566MSFT51.9751.98
32022-01-31 14:41:48.278566MSFT51.9952.00
42022-01-31 14:41:48.285566GOOG720.50720.93
52022-01-31 14:41:48.286566AAPL97.9998.01
62022-01-31 14:41:48.309566GOOG720.50720.88
72022-01-31 14:41:48.312566MSFT52.0152.03
-
-
-
-
-
trades
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerpricequantity
02022-01-31 14:41:48.288476MSFT51.9575
12022-01-31 14:41:48.303476MSFT51.95155
22022-01-31 14:41:48.313476GOOG720.77100
32022-01-31 14:41:48.313476GOOG720.92100
42022-01-31 14:41:48.313476AAPL98.00100
-
-
-
-
-
stocks
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tickernameexchange
0MSFTMicrosoft CorporationNASDAQ
1GOOGAlphabet IncNASDAQ
2AAPLApple IncNASDAQ
-
-
-
-
-

Build & Ingest Simple Feature Set (stocks)#

-
-
-
# add feature set without time column (stock ticker metadata) 
-stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
-fstore.ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameexchange
ticker
MSFTMicrosoft CorporationNASDAQ
GOOGAlphabet IncNASDAQ
AAPLApple IncNASDAQ
-
-
-
-
-

Build Advanced feature set - with feature engineering pipeline#

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-
-
-
-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-
-
-
quotes_set.graph.to("MyMap", multiplier=3)\
-                .to("storey.Extend", _fn="({'extra': event['bid'] * 77})")\
-                .to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")\
-                .to(FeaturesetValidator())
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(validator=MinMaxValidator(min=52, severity="info"))
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/01e548158b906df8cc3f4f282097c5f50e603245dd43f2314bc9ff5ef6aa1447.svg
-
-
-
-

Ingest Data Into Offline And Online Stores#

-
-
-
# save ingest data and print the FeatureSet spec
-fstore.ingest(quotes_set, quotes)
-
-
-
-
-
info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377248+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.377927+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.378103+00:00 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.578640+00:00 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.581692+00:00 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 14:41:51.584351+00:00 args={'min': 52, 'value': 51.99}
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
asks1_max_1hasks1_sum_1hasks5_max_5hasks5_sum_5hbids_max_1hbids_min_1htimebidaskmultiextra
ticker
GOOG720.93720.93720.93720.93720.50720.502022-01-31 14:41:48.260566720.50720.932161.5055478.50
MSFT51.9651.9651.9651.9651.9551.952022-01-31 14:41:48.26056651.9551.96155.854000.15
MSFT51.98103.9451.98103.9451.9751.952022-01-31 14:41:48.26756651.9751.98155.914001.69
MSFT52.00155.9452.00155.9451.9951.952022-01-31 14:41:48.27856651.9952.00155.974003.23
GOOG720.931441.86720.931441.86720.50720.502022-01-31 14:41:48.285566720.50720.932161.5055478.50
AAPL98.0198.0198.0198.0197.9997.992022-01-31 14:41:48.28656697.9998.01293.977545.23
GOOG720.932162.74720.932162.74720.50720.502022-01-31 14:41:48.309566720.50720.882161.5055478.50
MSFT52.03207.9752.03207.9752.0151.952022-01-31 14:41:48.31256652.0152.03156.034004.77
-
-
-
-
-

Get an Offline Feature Vector#

-
-
-
features = [
-    "stock-quotes.multi",
-    "stock-quotes.asks5_sum_5h as total_ask",
-    "stock-quotes.bids_min_1h",
-    "stock-quotes.bids_max_1h",
-    "stocks.*",
-]
-
-vector = fstore.FeatureVector("stocks-vec", features)
-vector.save()
-
-
-
-
-
-
-
target_dict = CSVTarget('mycsv',path=os.path.join(ABS_PATH, 'my_csv.csv')).to_dict()
-
-
-
-
-
-
-

Using get_offline_features()#

-
-
-
get_offline_features_fn = mlrun.import_function('hub://get_offline_features:development')
-
-
-
-
-
-
-
gof_run = get_offline_features_fn.run(
-    handler='get_offline_features',
-    inputs= {'entity_rows': data_uri},
-    params={'feature_vector': vector.uri,
-           'target': target_dict,
-            'entity_timestamp_column': "time",
-           },
-    local=True
-)
-
-
-
-
-
> 2022-01-31 14:41:52,066 [info] starting run get-offline-features-get_offline_features uid=956663b9a9ba448c9ea65e8e9245718e DB=http://mlrun-api:8080
-> 2022-01-31 14:41:52,214 [info] Creating DataFrame from entity_rows = v3io://users/yonatan/get_offline_features/trades.csv
-> 2022-01-31 14:41:52,292 [info] Preparing 'mycsv' target
-> 2022-01-31 14:41:52,294 [info] getting offline features from the FeatureVector store://feature-vectors/get-offline-features-yonatan/stocks-vec
-> 2022-01-31 14:41:52,708 [info] wrote target: {'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'status': 'ready', 'updated': '2022-01-31T14:41:52.708534+00:00'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
get-offline-features-yonatan0Jan 31 14:41:52completedget-offline-features-get_offline_features
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
entity_rows
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
target={'name': 'mycsv', 'kind': 'csv', 'path': 'v3io://users/yonatan/get_offline_features/my_csv.csv', 'partitioned': False}
entity_timestamp_column=time
target=v3io://users/yonatan/get_offline_features/my_csv.csv
feature_vector=store://feature-vectors/get-offline-features-yonatan/stocks-vec
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 14:41:52,896 [info] run executed, status=completed
-
-
-
-
-
-
-
gof_run.outputs['feature_vector']
-
-
-
-
-
'store://feature-vectors/get-offline-features-yonatan/stocks-vec'
-
-
-
-
-
-
-
mlrun.get_dataitem(gof_run.outputs['feature_vector']).as_df()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Unnamed: 0pricequantitymultitotal_askbids_min_1hbids_max_1hnameexchange
0051.9575155.8551.9651.9551.95Microsoft CorporationNASDAQ
1151.9575155.91103.9451.9551.97Microsoft CorporationNASDAQ
2251.9575155.97155.9451.9551.99Microsoft CorporationNASDAQ
3351.9575156.03207.9751.9552.01Microsoft CorporationNASDAQ
4451.95155155.8551.9651.9551.95Microsoft CorporationNASDAQ
5551.95155155.91103.9451.9551.97Microsoft CorporationNASDAQ
6651.95155155.97155.9451.9551.99Microsoft CorporationNASDAQ
7751.95155156.03207.9751.9552.01Microsoft CorporationNASDAQ
88720.771002161.50720.93720.50720.50Alphabet IncNASDAQ
99720.771002161.501441.86720.50720.50Alphabet IncNASDAQ
1010720.771002161.502162.74720.50720.50Alphabet IncNASDAQ
1111720.921002161.50720.93720.50720.50Alphabet IncNASDAQ
1212720.921002161.501441.86720.50720.50Alphabet IncNASDAQ
1313720.921002161.502162.74720.50720.50Alphabet IncNASDAQ
141498.00100293.9798.0197.9997.99Apple IncNASDAQ
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/latest/static/function.html b/functions/development/get_offline_features/latest/static/function.html deleted file mode 100644 index efdf38c0..00000000 --- a/functions/development/get_offline_features/latest/static/function.html +++ /dev/null @@ -1,149 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: get-offline-features
-  tag: ''
-  hash: 5ac6c4e2b67440b464710c072708a3581125c2f8
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: 
-    commands: []
-    code_origin: https://github.com/yonishelach/functions.git#82aab1724569d20c73cca114beb2fac0821d3383:/Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py
-    origin_filename: /Users/Yonatan_Shelach/projects/functions/get_offline_features/get_offline_features.py
-  entry_points:
-    get_offline_features:
-      name: get_offline_features
-      doc: 'retrieve offline feature vector results
-
-
-        specify a feature vector object/uri and retrieve the desired features, their
-        metadata
-
-        and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-
-        results can be returned as a dataframe or written to a target.
-
-        If feature vector does not exist, a new one will be created and saved with
-        the given features.
-
-
-        The start_time and end_time attributes allow filtering the data to a given
-        time range, they accept
-
-        string values or pandas `Timestamp` objects, string values can also be relative,
-        for example:
-
-        "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows
-        the verb "now",
-
-        for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will
-        align the time to the last hour
-
-        (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T,
-        S for day, hour, min, sec alignment)'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: feature_vector
-        type: str
-        doc: feature vector uri
-        default: ''
-      - name: features
-        type: Union[List[str], ]
-        doc: Relevant only if feature_vector not exist. list of feature to collect
-          to this vector format [/]. [as
-          ]
-        default: null
-      - name: label_feature
-        type: str
-        doc: feature name to be used as label data
-        default: null
-      - name: description
-        type: str
-        doc: text description of the vector
-        default: null
-      - name: entity_rows
-        type: DataItem
-        doc: URI of the data entity rows to join with
-        default: null
-      - name: entity_timestamp_column
-        type: str
-        doc: timestamp column name in the entity rows dataframe
-        default: null
-      - name: target
-        type: Union[str, Dict]
-        doc: where to write the results to
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: drop_columns
-        type: List[str]
-        doc: list of columns to drop from the final result
-        default: null
-      - name: start_time
-        type: str
-        doc: datetime, low limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: end_time
-        type: str
-        doc: datetime, high limit of time needed to be filtered. Optional entity_timestamp_column
-          must be passed when using time filtering
-        default: null
-      - name: with_indexes
-        type: bool
-        doc: return vector with index columns (default False)
-        default: false
-      - name: update_stats
-        type: bool
-        doc: update features statistics from the requested feature sets on the vector.
-          Default is False.
-        default: false
-      outputs:
-      - default: ''
-      lineno: 27
-  description: retrieve offline feature vector results
-  default_handler: get_offline_features
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/latest/static/get_offline_features.html b/functions/development/get_offline_features/latest/static/get_offline_features.html deleted file mode 100644 index f6c610d6..00000000 --- a/functions/development/get_offline_features/latest/static/get_offline_features.html +++ /dev/null @@ -1,282 +0,0 @@ - - - - - - - -get_offline_features.get_offline_features - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for get_offline_features.get_offline_features

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-
[docs]def get_offline_features( - context: MLClientCtx, - feature_vector: str, - features: Union[List[str], None] = None, - label_feature: str = None, - description: str = None, - entity_rows: DataItem = None, - entity_timestamp_column: str = None, - target: Union[str, Dict] = None, - run_config: Union[str, Dict] = None, - drop_columns: List[str] = None, - start_time: str = None, - end_time: str = None, - with_indexes: bool = False, - update_stats: bool = False, -): - """retrieve offline feature vector results - - specify a feature vector object/uri and retrieve the desired features, their metadata - and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`, - results can be returned as a dataframe or written to a target. - If feature vector does not exist, a new one will be created and saved with the given features. - - The start_time and end_time attributes allow filtering the data to a given time range, they accept - string values or pandas `Timestamp` objects, string values can also be relative, for example: - "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now", - for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour - (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment) - - - :param context: MLRun context - :param feature_vector: feature vector uri - :param features: Relevant only if feature_vector not exist. 
list of feature to collect to this vector - format [<project>/]<feature_set>.<feature_name or *> [as <alias>] - :param label_feature: feature name to be used as label data - :param description: text description of the vector - :param entity_rows: URI of the data entity rows to join with - :param target: where to write the results to - :param drop_columns: list of columns to drop from the final result - :param entity_timestamp_column: timestamp column name in the entity rows dataframe - :param run_config: function and/or run configuration - see :py:class:`~mlrun.feature_store.RunConfig` - :param start_time: datetime, low limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param end_time: datetime, high limit of time needed to be filtered. Optional - entity_timestamp_column must be passed when using time filtering - :param with_indexes: return vector with index columns (default False) - :param update_stats: update features statistics from the requested feature sets on the vector. Default is False. 
- - :returns feature_vector input - """ - - if features: - # Creating a new FeatureVector and saving: - if is_store_uri(feature_vector): - prefix, new_uri = parse_store_uri(feature_vector) - if prefix != StorePrefix.FeatureVector: - raise MLRunInvalidArgumentError( - f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})" - ) - feature_vector = new_uri - - context.logger.info(f"Creating FeatureVector {feature_vector}") - project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project) - vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description) - vector.metadata.project = project - vector.metadata.tag = tag - vector.save() - feature_vector_uri = vector.uri - else: - if is_store_uri(feature_vector): - feature_vector_uri = feature_vector - else: - vector = fs.get_feature_vector(feature_vector) - feature_vector_uri = vector.uri - - # Preparing entity_rows: - if entity_rows is not None: - context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}") - entity_rows = entity_rows.as_df() - - # Preparing target: - if target: - if isinstance(target, str): - target = kind_to_driver[target]() - - name = target.name if hasattr(target, "name") else target["name"] - context.logger.info(f"Preparing '{name}' target") - target = get_target_driver(target) - if hasattr(target, 'path') and target.path: - context.log_result("target", target.path) - - # Preparing run_config: - if run_config and isinstance(run_config, dict): - context.logger.info("Preparing run configuration") - run_config = fs.RunConfig(**run_config) - - # Calling get_offline_features: - context.logger.info( - f"getting offline features from the FeatureVector {feature_vector}" - ) - fs.get_offline_features( - feature_vector=feature_vector_uri, - entity_rows=entity_rows, - entity_timestamp_column=entity_timestamp_column, - target=target, - run_config=run_config, - drop_columns=drop_columns, - 
start_time=start_time, - end_time=end_time, - with_indexes=with_indexes, - update_stats=update_stats, - ) - - context.log_result("feature_vector", feature_vector) - context.log_result("feature_vector_uri", feature_vector_uri)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/get_offline_features/latest/static/item.html b/functions/development/get_offline_features/latest/static/item.html deleted file mode 100644 index 97bb495e..00000000 --- a/functions/development/get_offline_features/latest/static/item.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: retrieve offline feature vector results
-doc: ''
-example: get_offline_features.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: get_offline_features
-platformVersion: 3.5.0
-spec:
-  filename: get_offline_features.py
-  handler: get_offline_features
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.2.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/get_offline_features/latest/static/source.html b/functions/development/get_offline_features/latest/static/source.html deleted file mode 100644 index ab2901f3..00000000 --- a/functions/development/get_offline_features/latest/static/source.html +++ /dev/null @@ -1,164 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun
-import mlrun.feature_store as fs
-from mlrun.datastore.store_resources import is_store_uri, parse_store_uri
-from mlrun.datastore.targets import get_target_driver, kind_to_driver
-from mlrun.datastore.base import DataItem
-from mlrun.execution import MLClientCtx
-from mlrun.utils import StorePrefix, parse_versioned_object_uri
-from mlrun.errors import MLRunInvalidArgumentError
-
-
-def get_offline_features(
-    context: MLClientCtx,
-    feature_vector: str,
-    features: Union[List[str], None] = None,
-    label_feature: str = None,
-    description: str = None,
-    entity_rows: DataItem = None,
-    entity_timestamp_column: str = None,
-    target: Union[str, Dict] = None,
-    run_config: Union[str, Dict] = None,
-    drop_columns: List[str] = None,
-    start_time: str = None,
-    end_time: str = None,
-    with_indexes: bool = False,
-    update_stats: bool = False,
-):
-    """retrieve offline feature vector results
-
-    specify a feature vector object/uri and retrieve the desired features, their metadata
-    and statistics. returns :py:class:`~mlrun.feature_store.OfflineVectorResponse`,
-    results can be returned as a dataframe or written to a target.
-    If feature vector does not exist, a new one will be created and saved with the given features.
-
-    The start_time and end_time attributes allow filtering the data to a given time range, they accept
-    string values or pandas `Timestamp` objects, string values can also be relative, for example:
-    "now", "now - 1d2h", "now+5m", where a valid pandas Timedelta string follows the verb "now",
-    for time alignment you can use the verb "floor" e.g. "now -1d floor 1H" will align the time to the last hour
-    (the floor string is passed to pandas.Timestamp.floor(), can use D, H, T, S for day, hour, min, sec alignment)
-
-
-    :param context:        MLRun context
-    :param feature_vector: feature vector uri
-    :param features:       Relevant only if feature_vector not exist. list of feature to collect to this vector
-                           format [/]. [as ]
-    :param label_feature:  feature name to be used as label data
-    :param description:    text description of the vector
-    :param entity_rows:    URI of the data entity rows to join with
-    :param target:         where to write the results to
-    :param drop_columns:   list of columns to drop from the final result
-    :param entity_timestamp_column: timestamp column name in the entity rows dataframe
-    :param run_config:     function and/or run configuration
-                           see :py:class:`~mlrun.feature_store.RunConfig`
-    :param start_time:      datetime, low limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param end_time:        datetime, high limit of time needed to be filtered. Optional
-        entity_timestamp_column must be passed when using time filtering
-    :param with_indexes:    return vector with index columns (default False)
-    :param update_stats:    update features statistics from the requested feature sets on the vector. Default is False.
-
-    :returns feature_vector input
-    """
-
-    if features:
-        # Creating a new FeatureVector and saving:
-        if is_store_uri(feature_vector):
-            prefix, new_uri = parse_store_uri(feature_vector)
-            if prefix != StorePrefix.FeatureVector:
-                raise MLRunInvalidArgumentError(
-                    f"provided store uri ({feature_vector}) does not represent a feature vector (prefix={prefix})"
-                )
-            feature_vector = new_uri
-
-        context.logger.info(f"Creating FeatureVector {feature_vector}")
-        project, name, tag, _ = parse_versioned_object_uri(feature_vector, mlrun.mlconf.default_project)
-        vector = fs.FeatureVector(name, features, label_feature=label_feature, description=description)
-        vector.metadata.project = project
-        vector.metadata.tag = tag
-        vector.save()
-        feature_vector_uri = vector.uri
-    else:
-        if is_store_uri(feature_vector):
-            feature_vector_uri = feature_vector
-        else:
-            vector = fs.get_feature_vector(feature_vector)
-            feature_vector_uri = vector.uri
-
-    # Preparing entity_rows:
-    if entity_rows is not None:
-        context.logger.info(f"Creating DataFrame from entity_rows = {entity_rows}")
-        entity_rows = entity_rows.as_df()
-
-    # Preparing target:
-    if target:
-        if isinstance(target, str):
-            target = kind_to_driver[target]()
-
-        name = target.name if hasattr(target, "name") else target["name"]
-        context.logger.info(f"Preparing '{name}' target")
-        target = get_target_driver(target)
-    if hasattr(target, 'path') and target.path:
-        context.log_result("target", target.path)
-
-    # Preparing run_config:
-    if run_config and isinstance(run_config, dict):
-        context.logger.info("Preparing run configuration")
-        run_config = fs.RunConfig(**run_config)
-
-    # Calling get_offline_features:
-    context.logger.info(
-        f"getting offline features from the FeatureVector {feature_vector}"
-    )
-    fs.get_offline_features(
-        feature_vector=feature_vector_uri,
-        entity_rows=entity_rows,
-        entity_timestamp_column=entity_timestamp_column,
-        target=target,
-        run_config=run_config,
-        drop_columns=drop_columns,
-        start_time=start_time,
-        end_time=end_time,
-        with_indexes=with_indexes,
-        update_stats=update_stats,
-    )
-
-    context.log_result("feature_vector", feature_vector)
-    context.log_result("feature_vector_uri", feature_vector_uri)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/src/function.yaml b/functions/development/hugging_face_classifier_trainer/0.0.1/src/function.yaml deleted file mode 100755 index fa629abe..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/src/function.yaml +++ /dev/null @@ -1,381 +0,0 @@ -kind: job -metadata: - name: hugging-face-classifier-trainer - tag: '' - hash: bc6b080ee99a515a6ccbdbb3d3a26b74657d38ea - project: '' - labels: - author: davids - categories: - - machine-learning - - model-training -spec: - command: '' - args: [] - image: mlrun/ml-models - build: - functionSourceCode:  - commands: - - python -m pip install onnx~=1.10.1 onnxruntime~=1.8.1 optimum~=1.6.4 transformers~=4.26.1 - datasets~=2.10.1 scikit-learn~=1.0.2 - code_origin: https://github.com/davesh0812/functions.git#837c1850e97aab539fc2820db5e2b0100699543e:/Users/davids/Projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py - origin_filename: /Users/davids/Projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py - entry_points: - add_interface: - name: add_interface - doc: 'Enrich the object with this interface properties, methods and functions, - so it will have this TensorFlow.Keras - - MLRuns features.' - parameters: - - name: cls - default: '' - - name: obj - type: Trainer - doc: The object to enrich his interface. - default: '' - - name: restoration - type: MLRunInterfaceRestorationType - doc: Restoration information tuple as returned from 'remove_interface' in - order to add the interface in a certain state. - default: null - outputs: - - default: '' - lineno: 144 - mlrun_optimize: - name: mlrun_optimize - doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when - using horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. 
- - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' - parameters: - - name: cls - default: '' - outputs: - - default: '' - lineno: 77 - wrapper: - name: wrapper - doc: '' - parameters: - - name: self - type: Trainer - default: '' - outputs: - - default: '' - lineno: 172 - enable_auto_logging: - name: enable_auto_logging - doc: '' - parameters: - - name: self - default: '' - - name: context - type: MLClientCtx - default: '' - - name: model_name - type: str - default: model - - name: tag - type: str - default: '' - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: - - default: '' - lineno: 113 - mlrun_train: - name: mlrun_train - doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using - horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' 
- parameters: - - name: cls - default: '' - outputs: - - default: '' - lineno: 162 - on_epoch_begin: - name: on_epoch_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 219 - on_epoch_end: - name: on_epoch_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 228 - on_log: - name: on_log - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: logs - type: Dict[str, float] - default: null - outputs: - - default: '' - lineno: 237 - on_train_begin: - name: on_train_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 261 - on_train_end: - name: on_train_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: model - type: PreTrainedModel - default: null - - name: tokenizer - type: PreTrainedTokenizer - default: null - outputs: - - default: '' - lineno: 270 - on_evaluate: - name: on_evaluate - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - 
lineno: 321 - apply_mlrun: - name: apply_mlrun - doc: Wrap the given model with MLRun's interface providing it with mlrun's additional - features. - parameters: - - name: huggingface_object - doc: The model to wrap. Can be loaded from the model path given as well. - default: '' - - name: model_name - type: str - doc: 'The model name to use for storing the model artifact. Default: "model".' - default: null - - name: tag - type: str - doc: The model's tag to log with. - default: '' - - name: context - type: MLClientCtx - doc: MLRun context to work with. If no context is given it will be retrieved - via 'mlrun.get_or_create_ctx(None)' - default: null - - name: auto_log - type: bool - doc: 'Whether to enable MLRun''s auto logging. Default: True.' - default: true - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: - - default: '' - lineno: 420 - train: - name: train - doc: 'Training and evaluating a pretrained model with a pretrained tokenizer - over a dataset. - - The dataset can be either be the name of the dataset that contains in the - HuggingFace hub, - - or a URI or a FeatureVector' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: hf_dataset - type: str - doc: The name of the dataset to get from the HuggingFace hub - default: null - - name: dataset - type: DataItem - doc: The dataset to train the model on. Can be either a URI or a FeatureVector - default: null - - name: test_set - type: DataItem - doc: The test set to train the model with. - default: null - - name: drop_columns - type: Optional[List[str]] - doc: The columns to drop from the dataset. - default: null - - name: pretrained_tokenizer - type: str - doc: The name of the pretrained tokenizer from the HuggingFace hub. - default: null - - name: pretrained_model - type: str - doc: The name of the pretrained model from the HuggingFace hub. 
- default: null - - name: model_class - type: str - doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - default: null - - name: model_name - type: str - doc: The model's name to use for storing the model artifact, default to 'model' - default: huggingface-model - - name: label_name - type: str - doc: The target label of the column in the dataset. - default: labels - - name: text_col - type: str - doc: The input text column un the dataset. - default: text - - name: num_of_train_samples - type: int - doc: Max number of training samples, for debugging. - default: null - - name: train_test_split_size - type: float - doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset - to include in the test split. - default: null - - name: metrics - type: List[str] - doc: List of different metrics for evaluate the model such as f1, accuracy - etc. - default: null - - name: random_state - type: int - doc: Random state for train_test_split - default: null - outputs: - - default: '' - lineno: 645 - preprocess_function: - name: preprocess_function - doc: '' - parameters: - - name: examples - default: '' - outputs: - - default: '' - lineno: 694 - optimize: - name: optimize - doc: Optimizing the transformer model using ONNX optimization. - parameters: - - name: model_path - type: str - doc: The path of the model to optimize. - default: '' - - name: model_name - type: str - doc: Name of the optimized model. - default: optimized_model - - name: target_dir - type: str - doc: The directory to save the ONNX model. - default: ./optimized - - name: optimization_level - type: int - doc: Optimization level performed by ONNX Runtime of the loaded graph. 
(default - is 1) - default: 1 - outputs: - - default: '' - lineno: 797 - description: Automatic train and optimize functions for HuggingFace framework - default_handler: train - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/src/hugging_face_classifier_trainer.ipynb b/functions/development/hugging_face_classifier_trainer/0.0.1/src/hugging_face_classifier_trainer.ipynb deleted file mode 100644 index 16989335..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/src/hugging_face_classifier_trainer.ipynb +++ /dev/null @@ -1,455 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "\n", - "# MLRun Hugging Face Classifier Trainer Tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "This notebook shows how to use the handlers of the Hugging Face classifier trainer.\n", - "the following handlers are:\n", - "- `train`\n", - "- `optimize`\n", - "\n", - "All you need is simply **HF model type** and a **HF dataset name**." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2023-02-26 15:26:59,980 [info] loaded project hugging-face-trainer from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_or_create_project('hugging-face-trainer', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Importing the hugging_face_classifier_trainer function from the Marketplace**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "hugging_face_classifier_trainer = mlrun.import_function(\"hub://hugging_face_classifier_trainer\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Training a model**\n", - "\n", - "Choosing the `train` handler" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Define task parameters¶\n", - "* Class parameters should contain the prefix `CLASS_`\n", - "* Train parameters should contain the prefix `TRAIN_`" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_class = \"transformers.AutoModelForSequenceClassification\"\n", - 
"additional_parameters = {\n", - " \"TRAIN_output_dir\": \"finetuning-sentiment-model-3000-samples\",\n", - " \"TRAIN_learning_rate\": 2e-5,\n", - " \"TRAIN_per_device_train_batch_size\": 16,\n", - " \"TRAIN_per_device_eval_batch_size\": 16,\n", - " \"TRAIN_num_train_epochs\": 3,\n", - " \"TRAIN_weight_decay\": 0.01,\n", - " \"TRAIN_push_to_hub\": False,\n", - " \"TRAIN_evaluation_strategy\": \"epoch\",\n", - " \"TRAIN_eval_steps\": 1,\n", - " \"TRAIN_logging_steps\": 1,\n", - " \"CLASS_num_labels\": 2\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Running the Training job with the \"train\" handler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "train_run = hugging_face_classifier_trainer.run(params={\n", - " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", - " \"drop_columns\": [\n", - " \"airline_sentiment_confidence\",\n", - " \"negativereason_confidence\",\n", - " ],\n", - " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", - " \"pretrained_model\": \"distilbert-base-uncased\",\n", - " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", - " \"label_name\": \"airline_sentiment\",\n", - " \"num_of_train_samples\": 100,\n", - " \"metrics\": [\"accuracy\", \"f1\"],\n", - " \"random_state\": 42,\n", - " **additional_parameters\n", - " },\n", - " handler=\"train\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### The result of the train run" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'loss': 0.5363,\n", - " 'learning_rate': 0.0,\n", - " 'eval_loss': 0.48737242817878723,\n", - " 'eval_accuracy': 
0.7916666666666666,\n", - " 'eval_f1': 0.0,\n", - " 'eval_runtime': 0.5752,\n", - " 'eval_samples_per_second': 41.722,\n", - " 'eval_steps_per_second': 3.477,\n", - " 'train_runtime': 17.5022,\n", - " 'train_samples_per_second': 17.141,\n", - " 'train_steps_per_second': 1.2,\n", - " 'total_flos': 3327208489680.0,\n", - " 'loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',\n", - " 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',\n", - " 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',\n", - " 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',\n", - " 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',\n", - " 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',\n", - " 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',\n", - " 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',\n", - " 'tokenizer': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/tokenizer.zip',\n", - " 'model': 'store://artifacts/hugging-face-trainer-davids/huggingface-model:32a62cb55414402facbf47bb0470dc9f'}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_run.outputs" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "pycharm": { 
- "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "train_run.artifact('loss_plot').show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Getting the model for evaluating and predicting" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_path = train_run.outputs['model']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Optimize the model**\n", - "\n", - "Choosing the `optimize` handler\n", - "\n", - "The result of using this handled is an onnx optimized model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimize_run = hugging_face_classifier_trainer.run(params={\n", - " \"model_path\": str(model_path)\n", - " },\n", - " handler=\"optimize\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'model': 'store://artifacts/hugging-face-trainer-davids/optimized_model:2c355967689240de8964a1d10d137215'}" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "optimize_run.outputs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "[Back to the top](#top)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/src/hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/0.0.1/src/hugging_face_classifier_trainer.py deleted file mode 100755 index d9da0f87..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/src/hugging_face_classifier_trainer.py +++ /dev/null @@ -1,830 +0,0 @@ -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import transformers -from datasets import Dataset, load_dataset, load_metric -from mlrun import MLClientCtx -from mlrun import feature_store as fs -from mlrun.api.schemas import ObjectKind -from mlrun.artifacts import Artifact, PlotlyArtifact -from mlrun.datastore import DataItem -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import create_class -from plotly import graph_objects as go -from sklearn.model_selection import train_test_split -from transformers import ( - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - PreTrainedModel, - PreTrainedTokenizer, - Trainer, - TrainerCallback, - TrainerControl, - TrainerState, - TrainingArguments, -) - - -# ----------------------from MLRUN-------------------------------- -class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "optimize", - ] - - @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper - - def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data - - -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow 
keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: 
PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - ) - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return - - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - 
artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. 
If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError - - -# ---------------------- from auto_trainer-------------------------------- -class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_" - - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. 
- :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. - """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - if dataset.meta and dataset.meta.kind == ObjectKind.feature_vector: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {: , ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. 
- :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - 
test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." - ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() 
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config) diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/src/item.yaml b/functions/development/hugging_face_classifier_trainer/0.0.1/src/item.yaml deleted file mode 100755 index 611e0e1a..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/src/item.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: v1 -categories: -- 
machine-learning -- model-training -description: Automatic train and optimize functions for HuggingFace framework -doc: '' -example: hugging_face_classifier_trainer.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: davids -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.2.0 -name: hugging_face_classifier_trainer -platformVersion: 3.5.0 -spec: - filename: hugging_face_classifier_trainer.py - handler: train - image: mlrun/ml-models - kind: job - requirements: - - onnx~=1.10.1 - - onnxruntime~=1.8.1 - - optimum~=1.6.4 - - transformers~=4.26.1 - - datasets~=2.10.1 - - scikit-learn~=1.0.2 -url: '' -version: 0.0.1 diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/src/requirements.txt b/functions/development/hugging_face_classifier_trainer/0.0.1/src/requirements.txt deleted file mode 100644 index 10b0872a..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -onnx~=1.10.1 -onnxruntime~=1.8.1 -optimum~=1.6.4 -transformers~=4.26.1 -datasets~=2.10.1 -scikit-learn~=1.0.2 \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/src/test_hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/0.0.1/src/test_hugging_face_classifier_trainer.py deleted file mode 100644 index a5e0fee9..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/src/test_hugging_face_classifier_trainer.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os - -import mlrun -import pytest -from mlrun import import_function - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - -ADDITIONAL_PARAM_FOR_TRAIN = { - "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples", - "TRAIN_learning_rate": 2e-5, - "TRAIN_per_device_train_batch_size": 16, - "TRAIN_per_device_eval_batch_size": 16, - "TRAIN_num_train_epochs": 2, - "TRAIN_weight_decay": 0.01, - "TRAIN_push_to_hub": False, - "TRAIN_evaluation_strategy": "epoch", - "TRAIN_eval_steps": 1, - "TRAIN_logging_steps": 1, - "CLASS_num_labels": 2, -} - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. 
- """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(env_file=None): - if env_file: - mlrun.set_env_from_file(env_file) - mlrun.get_or_create_project( - "hugging-face-classifier-trainer-test", context="./", user_project=True - ) - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - "model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_and_optimize_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - optimize_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - 
"model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - - optimize_run = fn.run( - params={"model_path": train_run.outputs["model"]}, - handler="optimize", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - assert optimize_run and all( - key in optimize_run.outputs for key in ["model"] - ), "outputs should include more data" diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/static/documentation.html b/functions/development/hugging_face_classifier_trainer/0.0.1/static/documentation.html deleted file mode 100644 index 1652c838..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/static/documentation.html +++ /dev/null @@ -1,394 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

hugging_face_classifier_trainer package

- -
- -
-
-
-
-
-

hugging_face_classifier_trainer package#

-
-

Submodules#

-
-
-

hugging_face_classifier_trainer.hugging_face_classifier_trainer module#

-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFORTOptimizerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common., abc.ABC

-

Interface for adding MLRun features for tensorflow keras API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras -MLRun’s features. -:param obj: The object to enrich his interface. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-enable_auto_logging(context: mlrun.execution.MLClientCtx, model_name: str = 'model', tag: str = '', labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None)[source]#
-
-
-
-classmethod mlrun_optimize()[source]#
-

MLRun’s tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFTrainerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common., abc.ABC

-

Interface for adding MLRun features for tensorflow keras API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj: transformers.Trainer, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras -MLRuns features. -:param obj: The object to enrich his interface. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-classmethod mlrun_train()[source]#
-

MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.KWArgsPrefixes[source]#
-

Bases: object

-
-
-FIT = 'FIT_'#
-
-
-
-MODEL_CLASS = 'CLASS_'#
-
-
-
-PREDICT = 'PREDICT_'#
-
-
-
-TRAIN = 'TRAIN_'#
-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
-

Bases: transformers.

-

Callback for collecting logs during training / evaluation of the Trainer API.

-
-
-on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
-
-
-
-on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.apply_mlrun(huggingface_object, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
-

Wrap the given model with MLRun’s interface providing it with mlrun’s additional features. -:param huggingface_object: The model to wrap. Can be loaded from the model path given as well. -:param model_name: The model name to use for storing the model artifact. Default: “model”. -:param tag: The model’s tag to log with. -:param context: MLRun context to work with. If no context is given it will be retrieved via

-
-

‘mlrun.get_or_create_ctx(None)’

-
-
-
Parameters
-

auto_log – Whether to enable MLRun’s auto logging. Default: True.

-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.optimize(model_path: str, model_name: str = 'optimized_model', target_dir: str = './optimized', optimization_level: int = 1)[source]#
-

Optimizing the transformer model using ONNX optimization.

-
-
Parameters
-
    -
  • model_path – The path of the model to optimize.

  • -
  • model_name – Name of the optimized model.

  • -
  • target_dir – The directory to save the ONNX model.

  • -
  • optimization_level – Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)

  • -
-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.train(context: mlrun.execution.MLClientCtx, hf_dataset: Optional[str] = None, dataset: Optional[mlrun.datastore.base.DataItem] = None, test_set: Optional[mlrun.datastore.base.DataItem] = None, drop_columns: Optional[List[str]] = None, pretrained_tokenizer: Optional[str] = None, pretrained_model: Optional[str] = None, model_class: Optional[str] = None, model_name: str = 'huggingface-model', label_name: str = 'labels', text_col: str = 'text', num_of_train_samples: Optional[int] = None, train_test_split_size: Optional[float] = None, metrics: Optional[List[str]] = None, random_state: Optional[int] = None)[source]#
-

Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. -The dataset can be either be the name of the dataset that contains in the HuggingFace hub, -or a URI or a FeatureVector

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • hf_dataset – The name of the dataset to get from the HuggingFace hub

  • -
  • dataset – The dataset to train the model on. Can be either a URI or a FeatureVector

  • -
  • test_set – The test set to train the model with.

  • -
  • drop_columns – The columns to drop from the dataset.

  • -
  • pretrained_tokenizer – The name of the pretrained tokenizer from the HuggingFace hub.

  • -
  • pretrained_model – The name of the pretrained model from the HuggingFace hub.

  • -
  • model_name – The model’s name to use for storing the model artifact, default to ‘model’

  • -
  • model_class – The class of the model, e.g. transformers.AutoModelForSequenceClassification

  • -
  • label_name – The target label of the column in the dataset.

  • -
  • text_col – The input text column un the dataset.

  • -
  • num_of_train_samples – Max number of training samples, for debugging.

  • -
  • train_test_split_size – Should be between 0.0 and 1.0 and represent the proportion of the dataset to include -in the test split.

  • -
  • metrics – List of different metrics for evaluate the model such as f1, accuracy etc.

  • -
  • random_state – Random state for train_test_split

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/static/example.html b/functions/development/hugging_face_classifier_trainer/0.0.1/static/example.html deleted file mode 100644 index 9598e23c..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/static/example.html +++ /dev/null @@ -1,539 +0,0 @@ - - - - - - - -MLRun Hugging Face Classifier Trainer Tutorial - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-

-
-

MLRun Hugging Face Classifier Trainer Tutorial#

-

This notebook shows how to use the handlers of the Hugging Face classifier trainer. -the following handlers are:

-
    -
  • train

  • -
  • optimize

  • -
-

All you need is simply HF model type and a HF dataset name.

-
-
-
%pip install -r requirements.txt
-
-
-
-
-
-
-
import mlrun
-
-
-
-
-
-
-
mlrun.get_or_create_project('hugging-face-trainer', context="./", user_project=True)
-
-
-
-
-
> 2023-02-26 15:26:59,980 [info] loaded project hugging-face-trainer from MLRun DB
-
-
-
<mlrun.projects.project.MlrunProject at 0x7ff44733f3d0>
-
-
-
-
-
-

Importing the hugging_face_classifier_trainer function from the Marketplace#

-
-
-
hugging_face_classifier_trainer = mlrun.import_function("hub://hugging_face_classifier_trainer")
-
-
-
-
-
-
-

Training a model#

-

Choosing the train handler

-
-

Define task parameters¶#

-
    -
  • Class parameters should contain the prefix CLASS_

  • -
  • Train parameters should contain the prefix TRAIN_

  • -
-
-
-
model_class = "transformers.AutoModelForSequenceClassification"
-additional_parameters = {
-    "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples",
-    "TRAIN_learning_rate": 2e-5,
-    "TRAIN_per_device_train_batch_size": 16,
-    "TRAIN_per_device_eval_batch_size": 16,
-    "TRAIN_num_train_epochs": 3,
-    "TRAIN_weight_decay": 0.01,
-    "TRAIN_push_to_hub": False,
-    "TRAIN_evaluation_strategy": "epoch",
-    "TRAIN_eval_steps": 1,
-    "TRAIN_logging_steps": 1,
-    "CLASS_num_labels": 2
-}
-
-
-
-
-
-
-

Running the Training job with the “train” handler#

-
-
-
train_run = hugging_face_classifier_trainer.run(params={
-                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
-                                                        "drop_columns": [
-                                                            "airline_sentiment_confidence",
-                                                            "negativereason_confidence",
-                                                        ],
-                                                        "pretrained_tokenizer": "distilbert-base-uncased",
-                                                        "pretrained_model": "distilbert-base-uncased",
-                                                        "model_class": "transformers.AutoModelForSequenceClassification",
-                                                        "label_name": "airline_sentiment",
-                                                        "num_of_train_samples": 100,
-                                                        "metrics": ["accuracy", "f1"],
-                                                        "random_state": 42,
-                                                        **additional_parameters
-                                                    },
-                                                    handler="train",
-                                                    local=True,
-                                                )
-
-
-
-
-
-
-

The result of the train run#

-
-
-
train_run.outputs
-
-
-
-
-
{'loss': 0.5363,
- 'learning_rate': 0.0,
- 'eval_loss': 0.48737242817878723,
- 'eval_accuracy': 0.7916666666666666,
- 'eval_f1': 0.0,
- 'eval_runtime': 0.5752,
- 'eval_samples_per_second': 41.722,
- 'eval_steps_per_second': 3.477,
- 'train_runtime': 17.5022,
- 'train_samples_per_second': 17.141,
- 'train_steps_per_second': 1.2,
- 'total_flos': 3327208489680.0,
- 'loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',
- 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',
- 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',
- 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',
- 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',
- 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',
- 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',
- 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',
- 'tokenizer': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/tokenizer.zip',
- 'model': 'store://artifacts/hugging-face-trainer-davids/huggingface-model:32a62cb55414402facbf47bb0470dc9f'}
-
-
-
-
-
-
-
train_run.artifact('loss_plot').show()
-
-
-
-
-
- - -
-
- -
-
-
-
-

Getting the model for evaluating and predicting#

-
-
-
model_path = train_run.outputs['model']
-
-
-
-
-
-
-
-

Optimize the model#

-

Choosing the optimize handler

-

The result of using this handled is an onnx optimized model.

-
-
-
optimize_run = hugging_face_classifier_trainer.run(params={
-                                                        "model_path": str(model_path)
-                                                    },
-                                                    handler="optimize",
-                                                    local=True,
-                                                )
-
-
-
-
-
-
-
optimize_run.outputs
-
-
-
-
-
{'model': 'store://artifacts/hugging-face-trainer-davids/optimized_model:2c355967689240de8964a1d10d137215'}
-
-
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/static/function.html b/functions/development/hugging_face_classifier_trainer/0.0.1/static/function.html deleted file mode 100644 index a9c7b8ac..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/static/function.html +++ /dev/null @@ -1,403 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: hugging-face-classifier-trainer
-  tag: ''
-  hash: bc6b080ee99a515a6ccbdbb3d3a26b74657d38ea
-  project: ''
-  labels:
-    author: davids
-  categories:
-  - machine-learning
-  - model-training
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  build:
-    functionSourceCode: 
-    commands:
-    - python -m pip install onnx~=1.10.1 onnxruntime~=1.8.1 optimum~=1.6.4 transformers~=4.26.1
-      datasets~=2.10.1 scikit-learn~=1.0.2
-    code_origin: https://github.com/davesh0812/functions.git#837c1850e97aab539fc2820db5e2b0100699543e:/Users/davids/Projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py
-    origin_filename: /Users/davids/Projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py
-  entry_points:
-    add_interface:
-      name: add_interface
-      doc: 'Enrich the object with this interface properties, methods and functions,
-        so it will have this TensorFlow.Keras
-
-        MLRuns features.'
-      parameters:
-      - name: cls
-        default: ''
-      - name: obj
-        type: Trainer
-        doc: The object to enrich his interface.
-        default: ''
-      - name: restoration
-        type: MLRunInterfaceRestorationType
-        doc: Restoration information tuple as returned from 'remove_interface' in
-          order to add the interface in a certain state.
-        default: null
-      outputs:
-      - default: ''
-      lineno: 144
-    mlrun_optimize:
-      name: mlrun_optimize
-      doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when
-        using horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 77
-    wrapper:
-      name: wrapper
-      doc: ''
-      parameters:
-      - name: self
-        type: Trainer
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 172
-    enable_auto_logging:
-      name: enable_auto_logging
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: model_name
-        type: str
-        default: model
-      - name: tag
-        type: str
-        default: ''
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs:
-      - default: ''
-      lineno: 113
-    mlrun_train:
-      name: mlrun_train
-      doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using
-        horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 162
-    on_epoch_begin:
-      name: on_epoch_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 219
-    on_epoch_end:
-      name: on_epoch_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 228
-    on_log:
-      name: on_log
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: logs
-        type: Dict[str, float]
-        default: null
-      outputs:
-      - default: ''
-      lineno: 237
-    on_train_begin:
-      name: on_train_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 261
-    on_train_end:
-      name: on_train_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: model
-        type: PreTrainedModel
-        default: null
-      - name: tokenizer
-        type: PreTrainedTokenizer
-        default: null
-      outputs:
-      - default: ''
-      lineno: 270
-    on_evaluate:
-      name: on_evaluate
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 321
-    apply_mlrun:
-      name: apply_mlrun
-      doc: Wrap the given model with MLRun's interface providing it with mlrun's additional
-        features.
-      parameters:
-      - name: huggingface_object
-        doc: The model to wrap. Can be loaded from the model path given as well.
-        default: ''
-      - name: model_name
-        type: str
-        doc: 'The model name to use for storing the model artifact. Default: "model".'
-        default: null
-      - name: tag
-        type: str
-        doc: The model's tag to log with.
-        default: ''
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context to work with. If no context is given it will be retrieved
-          via 'mlrun.get_or_create_ctx(None)'
-        default: null
-      - name: auto_log
-        type: bool
-        doc: 'Whether to enable MLRun''s auto logging. Default: True.'
-        default: true
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs:
-      - default: ''
-      lineno: 420
-    train:
-      name: train
-      doc: 'Training and evaluating a pretrained model with a pretrained tokenizer
-        over a dataset.
-
-        The dataset can be either be the name of the dataset that contains in the
-        HuggingFace hub,
-
-        or a URI or a FeatureVector'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: hf_dataset
-        type: str
-        doc: The name of the dataset to get from the HuggingFace hub
-        default: null
-      - name: dataset
-        type: DataItem
-        doc: The dataset to train the model on. Can be either a URI or a FeatureVector
-        default: null
-      - name: test_set
-        type: DataItem
-        doc: The test set to train the model with.
-        default: null
-      - name: drop_columns
-        type: Optional[List[str]]
-        doc: The columns to drop from the dataset.
-        default: null
-      - name: pretrained_tokenizer
-        type: str
-        doc: The name of the pretrained tokenizer from the HuggingFace hub.
-        default: null
-      - name: pretrained_model
-        type: str
-        doc: The name of the pretrained model from the HuggingFace hub.
-        default: null
-      - name: model_class
-        type: str
-        doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-        default: null
-      - name: model_name
-        type: str
-        doc: The model's name to use for storing the model artifact, default to 'model'
-        default: huggingface-model
-      - name: label_name
-        type: str
-        doc: The target label of the column in the dataset.
-        default: labels
-      - name: text_col
-        type: str
-        doc: The input text column un the dataset.
-        default: text
-      - name: num_of_train_samples
-        type: int
-        doc: Max number of training samples, for debugging.
-        default: null
-      - name: train_test_split_size
-        type: float
-        doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset
-          to include in the test split.
-        default: null
-      - name: metrics
-        type: List[str]
-        doc: List of different metrics for evaluate the model such as f1, accuracy
-          etc.
-        default: null
-      - name: random_state
-        type: int
-        doc: Random state for train_test_split
-        default: null
-      outputs:
-      - default: ''
-      lineno: 645
-    preprocess_function:
-      name: preprocess_function
-      doc: ''
-      parameters:
-      - name: examples
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 694
-    optimize:
-      name: optimize
-      doc: Optimizing the transformer model using ONNX optimization.
-      parameters:
-      - name: model_path
-        type: str
-        doc: The path of the model to optimize.
-        default: ''
-      - name: model_name
-        type: str
-        doc: Name of the optimized model.
-        default: optimized_model
-      - name: target_dir
-        type: str
-        doc: The directory to save the ONNX model.
-        default: ./optimized
-      - name: optimization_level
-        type: int
-        doc: Optimization level performed by ONNX Runtime of the loaded graph. (default
-          is 1)
-        default: 1
-      outputs:
-      - default: ''
-      lineno: 797
-  description: Automatic train and optimize functions for HuggingFace framework
-  default_handler: train
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/static/hugging_face_classifier_trainer.html b/functions/development/hugging_face_classifier_trainer/0.0.1/static/hugging_face_classifier_trainer.html deleted file mode 100644 index d75afb92..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/static/hugging_face_classifier_trainer.html +++ /dev/null @@ -1,970 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer.hugging_face_classifier_trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for hugging_face_classifier_trainer.hugging_face_classifier_trainer

-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.api.schemas import ObjectKind
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-
[docs]class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "optimize", - ] - -
[docs] @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper
- -
[docs] def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data
- - -
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - -
[docs] @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper
- - -
[docs]class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - -
[docs] def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([])
- -
[docs] def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics()
- -
[docs] def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score)
- -
[docs] def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True
- -
[docs] def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - )
- -
[docs] def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return
- - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact)
- - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -
[docs]def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError
- - -# ---------------------- from auto_trainer-------------------------------- -
[docs]class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_"
- - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. - :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. 
- """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - if dataset.meta and dataset.meta.kind == ObjectKind.feature_vector: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {<old_name>: <new_name>, ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -
[docs]def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. - :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. 
- :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." 
- ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train()
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -
[docs]def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/static/item.html b/functions/development/hugging_face_classifier_trainer/0.0.1/static/item.html deleted file mode 100644 index 4ee1b804..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/static/item.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-training
-description: Automatic train and optimize functions for HuggingFace framework
-doc: ''
-example: hugging_face_classifier_trainer.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: davids
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.2.0
-name: hugging_face_classifier_trainer
-platformVersion: 3.5.0
-spec:
-  filename: hugging_face_classifier_trainer.py
-  handler: train
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - onnx~=1.10.1
-  - onnxruntime~=1.8.1
-  - optimum~=1.6.4
-  - transformers~=4.26.1
-  - datasets~=2.10.1
-  - scikit-learn~=1.0.2
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.0.1/static/source.html b/functions/development/hugging_face_classifier_trainer/0.0.1/static/source.html deleted file mode 100644 index 585fc6c7..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.0.1/static/source.html +++ /dev/null @@ -1,852 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.api.schemas import ObjectKind
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRun's context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to be inserted so the MLRun interface will be fully enabled.
-    _PROPERTIES = {
-        "_auto_log": False,
-        "_context": None,
-        "_model_name": "model",
-        "_tag": "",
-        "_labels": None,
-        "_extra_data": None,
-    }
-    _METHODS = ["enable_auto_logging"]
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "optimize",
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRun's features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-        super(HFORTOptimizerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_optimize(cls):
-        """
-        MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self, *args, **kwargs):
-            save_dir = cls._get_function_argument(
-                self.optimize,
-                argument_name="save_dir",
-                passed_args=args,
-                passed_kwargs=kwargs,
-            )[0]
-
-            # Call the original optimize method:
-            result = self.original_optimize(*args, **kwargs)
-
-            if self._auto_log:
-                # Log the onnx model:
-                self._context.log_model(
-                    key="model",
-                    db_key=self._model_name,
-                    model_file=f"{save_dir}/model_optimized.onnx",
-                    tag=self._tag,
-                    framework="ONNX",
-                    labels=self._labels,
-                    extra_data=self._extra_data,
-                )
-
-            return result
-
-        return wrapper
-
-    def enable_auto_logging(
-        self,
-        context: mlrun.MLClientCtx,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        self._auto_log = True
-
-        self._context = context
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data
-
-
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "train",
-        # "evaluate"
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj: Trainer,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRuns features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-
-        super(HFTrainerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_train(cls):
-
-        """
-        MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self: Trainer, *args, **kwargs):
-            # Restore the evaluation method as `train` will use it:
-            # cls._restore_attribute(obj=self, attribute_name="evaluate")
-
-            # Call the original fit method:
-            result = self.original_train(*args, **kwargs)
-
-            # Replace the evaluation method again:
-            # cls._replace_function(obj=self, function_name="evaluate")
-
-            return result
-
-        return wrapper
-
-
-class MLRunCallback(TrainerCallback):
-    """
-    Callback for collecting logs during training / evaluation of the `Trainer` API.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.MLClientCtx = None,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        super().__init__()
-
-        # Store the configurations:
-        self._context = (
-            context
-            if context is not None
-            else mlrun.get_or_create_ctx("./mlrun-huggingface")
-        )
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data if extra_data is not None else {}
-
-        # Set up the logging mode:
-        self._is_training = False
-        self._steps: List[List[int]] = []
-        self._metric_scores: Dict[str, List[float]] = {}
-        self._artifacts: Dict[str, Artifact] = {}
-
-    def on_epoch_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._steps.append([])
-
-    def on_epoch_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float] = None,
-        **kwargs,
-    ):
-        recent_logs = state.log_history[-1].copy()
-
-        recent_logs.pop("epoch")
-        current_step = int(recent_logs.pop("step"))
-        if current_step not in self._steps[-1]:
-            self._steps[-1].append(current_step)
-
-        for metric_name, metric_score in recent_logs.items():
-            if metric_name.startswith("train_"):
-                if metric_name.split("train_")[1] not in self._metric_scores:
-                    self._metric_scores[metric_name] = [metric_score]
-                continue
-            if metric_name not in self._metric_scores:
-                self._metric_scores[metric_name] = []
-            self._metric_scores[metric_name].append(metric_score)
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._is_training = True
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: PreTrainedModel = None,
-        tokenizer: PreTrainedTokenizer = None,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        temp_directory = tempfile.gettempdir()
-
-        # Save and log the tokenizer:
-        if tokenizer is not None:
-            # Save tokenizer:
-            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
-            tokenizer.save_pretrained(save_directory=tokenizer_dir)
-            # Zip the tokenizer directory:
-            tokenizer_zip = shutil.make_archive(
-                base_name="tokenizer",
-                format="zip",
-                root_dir=tokenizer_dir,
-            )
-            # Log the zip file:
-            self._artifacts["tokenizer"] = self._context.log_artifact(
-                item="tokenizer", local_path=tokenizer_zip
-            )
-
-        # Save the model:
-        model_dir = os.path.join(temp_directory, "model")
-        model.save_pretrained(save_directory=model_dir)
-
-        # Zip the model directory:
-        shutil.make_archive(
-            base_name="model",
-            format="zip",
-            root_dir=model_dir,
-        )
-
-        # Log the model:
-        self._context.log_model(
-            key="model",
-            db_key=self._model_name,
-            model_file="model.zip",
-            tag=self._tag,
-            framework="Hugging Face",
-            labels=self._labels,
-            extra_data={**self._artifacts, **self._extra_data},
-        )
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        if self._is_training:
-            return
-
-        # TODO: Update the model object
-
-    def _log_metrics(self):
-        for metric_name, metric_scores in self._metric_scores.items():
-            self._context.log_result(key=metric_name, value=metric_scores[-1])
-            if len(metric_scores) > 1:
-                self._log_metric_plot(name=metric_name, scores=metric_scores)
-        self._context.commit(completed=False)
-
-    def _log_metric_plot(self, name: str, scores: List[float]):
-        # Initialize a plotly figure:
-        metric_figure = go.Figure()
-
-        # Add titles:
-        metric_figure.update_layout(
-            title=name.capitalize().replace("_", " "),
-            xaxis_title="Samples",
-            yaxis_title="Scores",
-        )
-
-        # Draw:
-        metric_figure.add_trace(
-            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
-        )
-
-        # Create the plotly artifact:
-        artifact_name = f"{name}_plot"
-        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
-        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
-
-
-def _apply_mlrun_on_trainer(
-    trainer: transformers.Trainer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
-
-    HFTrainerMLRunInterface.add_interface(obj=trainer)
-
-    if auto_log:
-        trainer.add_callback(
-            MLRunCallback(
-                context=context,
-                model_name=model_name,
-                tag=tag,
-                labels=labels,
-                extra_data=extra_data,
-            )
-        )
-
-
-def _apply_mlrun_on_optimizer(
-    optimizer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(
-            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
-        )
-
-    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)
-
-    if auto_log:
-        optimizer.enable_auto_logging(
-            context=context,
-            model_name=model_name,
-            tag=tag,
-            labels=labels,
-            extra_data=extra_data,
-        )
-
-
-def apply_mlrun(
-    huggingface_object,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    """
-    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
-    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
-    :param model_name:         The model name to use for storing the model artifact. Default: "model".
-    :param tag:                The model's tag to log with.
-    :param context:            MLRun context to work with. If no context is given it will be retrieved via
-                               'mlrun.get_or_create_ctx(None)'
-    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
-    """
-
-    if isinstance(huggingface_object, transformers.Trainer):
-        return _apply_mlrun_on_trainer(
-            trainer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    import optimum.onnxruntime as optimum_ort
-
-    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
-        return _apply_mlrun_on_optimizer(
-            optimizer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    raise mlrun.errors.MLRunInvalidArgumentError
-
-
-# ---------------------- from auto_trainer--------------------------------
-class KWArgsPrefixes:
-    MODEL_CLASS = "CLASS_"
-    FIT = "FIT_"
-    TRAIN = "TRAIN_"
-    PREDICT = "PREDICT_"
-
-
-def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
-    """
-    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
-    keys.
-
-    :param src:         The source dict to extract the values from.
-    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
-                        prefix.
-    """
-    return {
-        key.replace(prefix_key, ""): val
-        for key, val in src.items()
-        if key.startswith(prefix_key)
-    }
-
-
-def _get_dataframe(
-    context: MLClientCtx,
-    dataset: DataItem,
-    label_columns: Optional[Union[str, List[str]]] = None,
-    drop_columns: Union[str, List[str], int, List[int]] = None,
-) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
-    """
-    Getting the DataFrame of the dataset and drop the columns accordingly.
-
-    :param context:         MLRun context.
-    :param dataset:         The dataset to train the model on.
-                            Can be either a list of lists, dict, URI or a FeatureVector.
-    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
-                            Classification tasks.
-    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
-    """
-    if isinstance(dataset, (list, dict)):
-        dataset = pd.DataFrame(dataset)
-        # Checking if drop_columns provided by integer type:
-        if drop_columns:
-            if isinstance(drop_columns, str) or (
-                isinstance(drop_columns, list)
-                and any(isinstance(col, str) for col in drop_columns)
-            ):
-                context.logger.error(
-                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
-                )
-                raise ValueError
-            dataset.drop(drop_columns, axis=1, inplace=True)
-
-        return dataset, label_columns
-
-    if dataset.meta and dataset.meta.kind == ObjectKind.feature_vector:
-        # feature-vector case:
-        label_columns = label_columns or dataset.meta.status.label_column
-        dataset = fs.get_offline_features(
-            dataset.meta.uri, drop_columns=drop_columns
-        ).to_dataframe()
-
-        context.logger.info(f"label columns: {label_columns}")
-    else:
-        # simple URL case:
-        dataset = dataset.as_df()
-        if drop_columns:
-            if all(col in dataset for col in drop_columns):
-                dataset = dataset.drop(drop_columns, axis=1)
-            else:
-                context.logger.info(
-                    "not all of the columns to drop in the dataset, drop columns process skipped"
-                )
-    return dataset, label_columns
-
-
-# ---------------------- Hugging Face Trainer --------------------------------
-
-
-def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
-    """
-    This function create and returns a function that will be used to compute metrics at evaluation.
-    :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc.
-
-    :returns: Function that will be used to compute metrics at evaluation.
-             Must take a [`EvalPrediction`] and return a dictionary string to metric values.
-    """
-
-    def _compute_metrics(eval_pred):
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        metric_dict_results = {}
-        for metric in metrics:
-            load_met = load_metric(metric)
-            metric_res = load_met.compute(predictions=predictions, references=labels)[
-                metric
-            ]
-            metric_dict_results[metric] = metric_res
-
-        return metric_dict_results
-
-    return _compute_metrics
-
-
-def _edit_columns(
-    dataset: Dataset,
-    drop_columns: List[str] = None,
-    rename_columns: [str, str] = None,
-) -> Dataset:
-    """
-    Drop and renames that columns of the given dataset
-    :param dataset:         Dataset to process
-    :param drop_columns:    The columns to drop from the dataset.
-    :param rename_columns:  Dict of columns ro rename : {: , ...}
-
-    :returns: The dataset after the desired process
-    """
-    if drop_columns:
-        dataset = dataset.remove_columns(drop_columns)
-    if rename_columns:
-        dataset = dataset.rename_columns(rename_columns)
-    return dataset
-
-
-def _prepare_dataset(
-    context: MLClientCtx,
-    dataset_name: str,
-    label_name: str = None,
-    drop_columns: Optional[List[str]] = None,
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    random_state: int = None,
-) -> Tuple[Dataset, Dataset]:
-    """
-    Loading the dataset and editing the columns
-
-    :param context:                 MLRun contex
-    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
-    :param label_name:              The target label of the column in the dataset.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param random_state:            Random state for train_test_split
-
-    """
-
-    context.logger.info(
-        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
-    )
-    rename_cols = {label_name: "labels"}
-
-    # Loading and editing dataset:
-    dataset = load_dataset(dataset_name)
-
-    # train set
-    train_dataset = dataset["train"]
-    if num_of_train_samples:
-        train_dataset = train_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_train_samples))
-        )
-    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)
-
-    # test set
-    test_dataset = dataset["test"]
-    if train_test_split_size or num_of_train_samples:
-        train_test_split_size = train_test_split_size or 0.2
-        num_of_test_samples = int(
-            (train_dataset.num_rows * train_test_split_size)
-            // (1 - train_test_split_size)
-        )
-        test_dataset = test_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_test_samples))
-        )
-    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)
-
-    return train_dataset, test_dataset
-
-
-def train(
-    context: MLClientCtx,
-    hf_dataset: str = None,
-    dataset: DataItem = None,
-    test_set: DataItem = None,
-    drop_columns: Optional[List[str]] = None,
-    pretrained_tokenizer: str = None,
-    pretrained_model: str = None,
-    model_class: str = None,
-    model_name: str = "huggingface-model",
-    label_name: str = "labels",
-    text_col: str = "text",
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    metrics: List[str] = None,
-    random_state: int = None,
-):
-    """
-    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
-    The dataset can be either be the name of the dataset that contains in the HuggingFace hub,
-    or a URI or a FeatureVector
-
-    :param context:                 MLRun context
-    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
-    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
-    :param test_set:                The test set to train the model with.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
-    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
-    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
-    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-    :param label_name:              The target label of the column in the dataset.
-    :param text_col:                The input text column un the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param metrics:                 List of different metrics for evaluate the model such as f1, accuracy etc.
-    :param random_state:            Random state for train_test_split
-    """
-
-    if train_test_split_size is None and test_set is None:
-        context.logger.info(
-            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
-        )
-        train_test_split_size = 0.2
-
-    # Creating tokenizer:
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)
-
-    def preprocess_function(examples):
-        return tokenizer(examples[text_col], truncation=True)
-
-    # prepare data for training
-    if hf_dataset:
-        train_dataset, test_dataset = _prepare_dataset(
-            context,
-            hf_dataset,
-            label_name,
-            drop_columns,
-            num_of_train_samples,
-            train_test_split_size,
-            random_state=random_state,
-        )
-    elif dataset:
-        # Get DataFrame by URL or by FeatureVector:
-        train_dataset, label_name = _get_dataframe(
-            context=context,
-            dataset=dataset,
-            label_columns=label_name,
-            drop_columns=drop_columns,
-        )
-        if test_set:
-            test_dataset, _ = _get_dataframe(
-                context=context,
-                dataset=test_set,
-                label_columns=label_name,
-                drop_columns=drop_columns,
-            )
-        else:
-            train_dataset, test_dataset = train_test_split(
-                train_dataset,
-                test_size=train_test_split_size,
-                random_state=random_state,
-            )
-        train_dataset = Dataset.from_pandas(train_dataset)
-        test_dataset = Dataset.from_pandas(test_dataset)
-    else:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            "Training data was not provided. A training dataset is mandatory for training."
-            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
-        )
-
-    # Mapping datasets with the tokenizer:
-    tokenized_train = train_dataset.map(preprocess_function, batched=True)
-    tokenized_test = test_dataset.map(preprocess_function, batched=True)
-
-    # Creating data collator for batching:
-    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-    # Parsing kwargs:
-    train_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
-    )
-    model_class_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
-    )
-
-    # Loading our pretrained model:
-    model_class_kwargs["pretrained_model_name_or_path"] = (
-        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
-    )
-    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
-    if not model_class_kwargs["pretrained_model_name_or_path"]:
-        raise mlrun.errors.MLRunRuntimeError(
-            "Must provide pretrained_model name as "
-            "function argument or in extra params"
-        )
-    model = create_class(model_class).from_pretrained(**model_class_kwargs)
-
-    # Preparing training arguments:
-    training_args = TrainingArguments(
-        **train_kwargs,
-    )
-
-    compute_metrics = _create_compute_metrics(metrics) if metrics else None
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_test,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics,
-    )
-
-    apply_mlrun(trainer, model_name=model_name)
-
-    # Apply training with evaluation:
-    context.logger.info(f"training '{model_name}'")
-    trainer.train()
-
-
-def _get_model_dir(model_uri: str):
-    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
-    model_dir = tempfile.gettempdir()
-    # Unzip the Model:
-    with zipfile.ZipFile(model_file, "r") as zip_file:
-        zip_file.extractall(model_dir)
-
-    return model_dir
-
-
-def optimize(
-    model_path: str,
-    model_name: str = "optimized_model",
-    target_dir: str = "./optimized",
-    optimization_level: int = 1,
-):
-    """
-    Optimizing the transformer model using ONNX optimization.
-
-
-    :param model_path:          The path of the model to optimize.
-    :param model_name:          Name of the optimized model.
-    :param target_dir:          The directory to save the ONNX model.
-    :param optimization_level:  Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)
-    """
-    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
-    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
-    from optimum.onnxruntime.configuration import OptimizationConfig
-
-    model_dir = _get_model_dir(model_uri=model_path)
-    # Creating configuration for optimization step:
-    optimization_config = OptimizationConfig(optimization_level=optimization_level)
-
-    # Converting our pretrained model to an ONNX-Runtime model:
-    ort_model = ORTModelForSequenceClassification.from_pretrained(
-        model_dir, from_transformers=True
-    )
-
-    # Creating an ONNX-Runtime optimizer from ONNX model:
-    optimizer = ORTOptimizer.from_pretrained(ort_model)
-
-    apply_mlrun(optimizer, model_name=model_name)
-    # Optimizing and saving the ONNX model:
-    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/src/function.yaml b/functions/development/hugging_face_classifier_trainer/0.1.0/src/function.yaml deleted file mode 100644 index 6d53297e..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/src/function.yaml +++ /dev/null @@ -1,381 +0,0 @@ -kind: job -metadata: - name: hugging-face-classifier-trainer - tag: '' - hash: e5d9d8ea6d86779e43cb11c261c36e2812a75653 - project: '' - labels: - author: davids - categories: - - machine-learning - - model-training -spec: - command: '' - args: [] - image: mlrun/ml-models - build: - functionSourceCode:  - commands: - - python -m pip install 'onnx~=1.10.1' 'onnxruntime~=1.8.1' 'optimum~=1.6.4' 'transformers~=4.26.1' - 'datasets~=2.10.1' 'scikit-learn~=1.0.2' - code_origin: https://github.com/yonishelach/functions.git#1f6afed6bbe17186995c39b6e40067cfd8dc6e64:/Users/Yonatan_Shelach/projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py - origin_filename: /Users/Yonatan_Shelach/projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py - entry_points: - add_interface: - name: add_interface - doc: 'Enrich the object with this interface properties, methods and functions, - so it will have this TensorFlow.Keras - - MLRuns features.' - parameters: - - name: cls - default: '' - - name: obj - type: Trainer - doc: The object to enrich his interface. - default: '' - - name: restoration - type: MLRunInterfaceRestorationType - doc: Restoration information tuple as returned from 'remove_interface' in - order to add the interface in a certain state. - default: null - outputs: - - default: '' - lineno: 146 - mlrun_optimize: - name: mlrun_optimize - doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when - using horovod. 
The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' - parameters: - - name: cls - default: '' - outputs: - - default: '' - lineno: 79 - wrapper: - name: wrapper - doc: '' - parameters: - - name: self - type: Trainer - default: '' - outputs: - - default: '' - lineno: 173 - enable_auto_logging: - name: enable_auto_logging - doc: '' - parameters: - - name: self - default: '' - - name: context - type: MLClientCtx - default: '' - - name: model_name - type: str - default: model - - name: tag - type: str - default: '' - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: - - default: '' - lineno: 114 - mlrun_train: - name: mlrun_train - doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using - horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' 
- parameters: - - name: cls - default: '' - outputs: - - default: '' - lineno: 164 - on_epoch_begin: - name: on_epoch_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 220 - on_epoch_end: - name: on_epoch_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 229 - on_log: - name: on_log - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: logs - type: Dict[str, float] - default: null - outputs: - - default: '' - lineno: 238 - on_train_begin: - name: on_train_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 262 - on_train_end: - name: on_train_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: model - type: PreTrainedModel - default: null - - name: tokenizer - type: PreTrainedTokenizer - default: null - outputs: - - default: '' - lineno: 271 - on_evaluate: - name: on_evaluate - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - 
lineno: 322 - apply_mlrun: - name: apply_mlrun - doc: Wrap the given model with MLRun's interface providing it with mlrun's additional - features. - parameters: - - name: huggingface_object - doc: The model to wrap. Can be loaded from the model path given as well. - default: '' - - name: model_name - type: str - doc: 'The model name to use for storing the model artifact. Default: "model".' - default: null - - name: tag - type: str - doc: The model's tag to log with. - default: '' - - name: context - type: MLClientCtx - doc: MLRun context to work with. If no context is given it will be retrieved - via 'mlrun.get_or_create_ctx(None)' - default: null - - name: auto_log - type: bool - doc: 'Whether to enable MLRun''s auto logging. Default: True.' - default: true - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: - - default: '' - lineno: 421 - train: - name: train - doc: 'Training and evaluating a pretrained model with a pretrained tokenizer - over a dataset. - - The dataset can be either be the name of the dataset that contains in the - HuggingFace hub, - - or a URI or a FeatureVector' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - default: '' - - name: hf_dataset - type: str - doc: The name of the dataset to get from the HuggingFace hub - default: null - - name: dataset - type: DataItem - doc: The dataset to train the model on. Can be either a URI or a FeatureVector - default: null - - name: test_set - type: DataItem - doc: The test set to train the model with. - default: null - - name: drop_columns - type: Optional[List[str]] - doc: The columns to drop from the dataset. - default: null - - name: pretrained_tokenizer - type: str - doc: The name of the pretrained tokenizer from the HuggingFace hub. - default: null - - name: pretrained_model - type: str - doc: The name of the pretrained model from the HuggingFace hub. 
- default: null - - name: model_class - type: str - doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - default: null - - name: model_name - type: str - doc: The model's name to use for storing the model artifact, default to 'model' - default: huggingface-model - - name: label_name - type: str - doc: The target label of the column in the dataset. - default: labels - - name: text_col - type: str - doc: The input text column un the dataset. - default: text - - name: num_of_train_samples - type: int - doc: Max number of training samples, for debugging. - default: null - - name: train_test_split_size - type: float - doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset - to include in the test split. - default: null - - name: metrics - type: List[str] - doc: List of different metrics for evaluate the model such as f1, accuracy - etc. - default: null - - name: random_state - type: int - doc: Random state for train_test_split - default: null - outputs: - - default: '' - lineno: 647 - preprocess_function: - name: preprocess_function - doc: '' - parameters: - - name: examples - default: '' - outputs: - - default: '' - lineno: 696 - optimize: - name: optimize - doc: Optimizing the transformer model using ONNX optimization. - parameters: - - name: model_path - type: str - doc: The path of the model to optimize. - default: '' - - name: model_name - type: str - doc: Name of the optimized model. - default: optimized_model - - name: target_dir - type: str - doc: The directory to save the ONNX model. - default: ./optimized - - name: optimization_level - type: int - doc: Optimization level performed by ONNX Runtime of the loaded graph. 
(default - is 1) - default: 1 - outputs: - - default: '' - lineno: 799 - description: Automatic train and optimize functions for HuggingFace framework - default_handler: train - disable_auto_mount: false - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/src/hugging_face_classifier_trainer.ipynb b/functions/development/hugging_face_classifier_trainer/0.1.0/src/hugging_face_classifier_trainer.ipynb deleted file mode 100644 index 16989335..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/src/hugging_face_classifier_trainer.ipynb +++ /dev/null @@ -1,455 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "\n", - "# MLRun Hugging Face Classifier Trainer Tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "This notebook shows how to use the handlers of the Hugging Face classifier trainer.\n", - "the following handlers are:\n", - "- `train`\n", - "- `optimize`\n", - "\n", - "All you need is simply **HF model type** and a **HF dataset name**." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2023-02-26 15:26:59,980 [info] loaded project hugging-face-trainer from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mlrun.get_or_create_project('hugging-face-trainer', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Importing the hugging_face_classifier_trainer function from the Marketplace**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "hugging_face_classifier_trainer = mlrun.import_function(\"hub://hugging_face_classifier_trainer\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Training a model**\n", - "\n", - "Choosing the `train` handler" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Define task parameters¶\n", - "* Class parameters should contain the prefix `CLASS_`\n", - "* Train parameters should contain the prefix `TRAIN_`" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_class = \"transformers.AutoModelForSequenceClassification\"\n", - 
"additional_parameters = {\n", - " \"TRAIN_output_dir\": \"finetuning-sentiment-model-3000-samples\",\n", - " \"TRAIN_learning_rate\": 2e-5,\n", - " \"TRAIN_per_device_train_batch_size\": 16,\n", - " \"TRAIN_per_device_eval_batch_size\": 16,\n", - " \"TRAIN_num_train_epochs\": 3,\n", - " \"TRAIN_weight_decay\": 0.01,\n", - " \"TRAIN_push_to_hub\": False,\n", - " \"TRAIN_evaluation_strategy\": \"epoch\",\n", - " \"TRAIN_eval_steps\": 1,\n", - " \"TRAIN_logging_steps\": 1,\n", - " \"CLASS_num_labels\": 2\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Running the Training job with the \"train\" handler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "train_run = hugging_face_classifier_trainer.run(params={\n", - " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", - " \"drop_columns\": [\n", - " \"airline_sentiment_confidence\",\n", - " \"negativereason_confidence\",\n", - " ],\n", - " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", - " \"pretrained_model\": \"distilbert-base-uncased\",\n", - " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", - " \"label_name\": \"airline_sentiment\",\n", - " \"num_of_train_samples\": 100,\n", - " \"metrics\": [\"accuracy\", \"f1\"],\n", - " \"random_state\": 42,\n", - " **additional_parameters\n", - " },\n", - " handler=\"train\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### The result of the train run" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'loss': 0.5363,\n", - " 'learning_rate': 0.0,\n", - " 'eval_loss': 0.48737242817878723,\n", - " 'eval_accuracy': 
0.7916666666666666,\n", - " 'eval_f1': 0.0,\n", - " 'eval_runtime': 0.5752,\n", - " 'eval_samples_per_second': 41.722,\n", - " 'eval_steps_per_second': 3.477,\n", - " 'train_runtime': 17.5022,\n", - " 'train_samples_per_second': 17.141,\n", - " 'train_steps_per_second': 1.2,\n", - " 'total_flos': 3327208489680.0,\n", - " 'loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',\n", - " 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',\n", - " 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',\n", - " 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',\n", - " 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',\n", - " 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',\n", - " 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',\n", - " 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',\n", - " 'tokenizer': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/tokenizer.zip',\n", - " 'model': 'store://artifacts/hugging-face-trainer-davids/huggingface-model:32a62cb55414402facbf47bb0470dc9f'}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_run.outputs" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "pycharm": { 
- "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "train_run.artifact('loss_plot').show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Getting the model for evaluating and predicting" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_path = train_run.outputs['model']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Optimize the model**\n", - "\n", - "Choosing the `optimize` handler\n", - "\n", - "The result of using this handled is an onnx optimized model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimize_run = hugging_face_classifier_trainer.run(params={\n", - " \"model_path\": str(model_path)\n", - " },\n", - " handler=\"optimize\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'model': 'store://artifacts/hugging-face-trainer-davids/optimized_model:2c355967689240de8964a1d10d137215'}" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "optimize_run.outputs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "[Back to the top](#top)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - 
"nbformat_minor": 4 -} diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/src/hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/0.1.0/src/hugging_face_classifier_trainer.py deleted file mode 100755 index 29d07039..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/src/hugging_face_classifier_trainer.py +++ /dev/null @@ -1,832 +0,0 @@ -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import mlrun -import mlrun.datastore -import mlrun.utils -import numpy as np -import pandas as pd -import transformers -from datasets import Dataset, load_dataset, load_metric -from mlrun import MLClientCtx -from mlrun import feature_store as fs -from mlrun.artifacts import Artifact, PlotlyArtifact -from mlrun.datastore import DataItem -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import create_class -from plotly import graph_objects as go -from sklearn.model_selection import train_test_split -from transformers import ( - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - PreTrainedModel, - PreTrainedTokenizer, - Trainer, - TrainerCallback, - TrainerControl, - TrainerState, - TrainingArguments, -) - - -# ----------------------from MLRUN-------------------------------- -class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "optimize", - ] - - @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper - - def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data - - -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow 
keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: 
PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - ) - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return - - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - 
artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. 
If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError - - -# ---------------------- from auto_trainer-------------------------------- -class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_" - - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. 
- :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. - """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {: , ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. 
- :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - 
test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." - ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() 
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config) diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/src/item.yaml b/functions/development/hugging_face_classifier_trainer/0.1.0/src/item.yaml deleted file mode 100755 index 2e5d97f5..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/src/item.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: v1 -categories: -- 
machine-learning -- model-training -description: Automatic train and optimize functions for HuggingFace framework -doc: '' -example: hugging_face_classifier_trainer.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: davids -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.2.0 -name: hugging_face_classifier_trainer -platformVersion: 3.5.0 -spec: - filename: hugging_face_classifier_trainer.py - handler: train - image: mlrun/ml-models - kind: job - requirements: - - onnx~=1.10.1 - - onnxruntime~=1.8.1 - - optimum~=1.6.4 - - transformers~=4.26.1 - - datasets~=2.10.1 - - scikit-learn~=1.0.2 -url: '' -version: 0.1.0 diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/src/requirements.txt b/functions/development/hugging_face_classifier_trainer/0.1.0/src/requirements.txt deleted file mode 100644 index 10b0872a..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -onnx~=1.10.1 -onnxruntime~=1.8.1 -optimum~=1.6.4 -transformers~=4.26.1 -datasets~=2.10.1 -scikit-learn~=1.0.2 \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/src/test_hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/0.1.0/src/test_hugging_face_classifier_trainer.py deleted file mode 100644 index a5e0fee9..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/src/test_hugging_face_classifier_trainer.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os - -import mlrun -import pytest -from mlrun import import_function - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - -ADDITIONAL_PARAM_FOR_TRAIN = { - "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples", - "TRAIN_learning_rate": 2e-5, - "TRAIN_per_device_train_batch_size": 16, - "TRAIN_per_device_eval_batch_size": 16, - "TRAIN_num_train_epochs": 2, - "TRAIN_weight_decay": 0.01, - "TRAIN_push_to_hub": False, - "TRAIN_evaluation_strategy": "epoch", - "TRAIN_eval_steps": 1, - "TRAIN_logging_steps": 1, - "CLASS_num_labels": 2, -} - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. 
- """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(env_file=None): - if env_file: - mlrun.set_env_from_file(env_file) - mlrun.get_or_create_project( - "hugging-face-classifier-trainer-test", context="./", user_project=True - ) - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - "model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_and_optimize_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - optimize_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - 
"model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - - optimize_run = fn.run( - params={"model_path": train_run.outputs["model"]}, - handler="optimize", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - assert optimize_run and all( - key in optimize_run.outputs for key in ["model"] - ), "outputs should include more data" diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/static/documentation.html b/functions/development/hugging_face_classifier_trainer/0.1.0/static/documentation.html deleted file mode 100644 index 1652c838..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/static/documentation.html +++ /dev/null @@ -1,394 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

hugging_face_classifier_trainer package

- -
- -
-
-
-
-
-

hugging_face_classifier_trainer package#

-
-

Submodules#

-
-
-

hugging_face_classifier_trainer.hugging_face_classifier_trainer module#

-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFORTOptimizerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common., abc.ABC

-

Interface for adding MLRun features for tensorflow keras API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras -MLRun’s features. -:param obj: The object to enrich his interface. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-enable_auto_logging(context: mlrun.execution.MLClientCtx, model_name: str = 'model', tag: str = '', labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None)[source]#
-
-
-
-classmethod mlrun_optimize()[source]#
-

MLRun’s tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFTrainerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common., abc.ABC

-

Interface for adding MLRun features for tensorflow keras API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj: transformers.Trainer, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras -MLRuns features. -:param obj: The object to enrich his interface. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-classmethod mlrun_train()[source]#
-

MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.KWArgsPrefixes[source]#
-

Bases: object

-
-
-FIT = 'FIT_'#
-
-
-
-MODEL_CLASS = 'CLASS_'#
-
-
-
-PREDICT = 'PREDICT_'#
-
-
-
-TRAIN = 'TRAIN_'#
-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
-

Bases: transformers.

-

Callback for collecting logs during training / evaluation of the Trainer API.

-
-
-on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
-
-
-
-on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.apply_mlrun(huggingface_object, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
-

Wrap the given model with MLRun’s interface providing it with mlrun’s additional features. -:param huggingface_object: The model to wrap. Can be loaded from the model path given as well. -:param model_name: The model name to use for storing the model artifact. Default: “model”. -:param tag: The model’s tag to log with. -:param context: MLRun context to work with. If no context is given it will be retrieved via

-
-

‘mlrun.get_or_create_ctx(None)’

-
-
-
Parameters
-

auto_log – Whether to enable MLRun’s auto logging. Default: True.

-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.optimize(model_path: str, model_name: str = 'optimized_model', target_dir: str = './optimized', optimization_level: int = 1)[source]#
-

Optimizing the transformer model using ONNX optimization.

-
-
Parameters
-
    -
  • model_path – The path of the model to optimize.

  • -
  • model_name – Name of the optimized model.

  • -
  • target_dir – The directory to save the ONNX model.

  • -
  • optimization_level – Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)

  • -
-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.train(context: mlrun.execution.MLClientCtx, hf_dataset: Optional[str] = None, dataset: Optional[mlrun.datastore.base.DataItem] = None, test_set: Optional[mlrun.datastore.base.DataItem] = None, drop_columns: Optional[List[str]] = None, pretrained_tokenizer: Optional[str] = None, pretrained_model: Optional[str] = None, model_class: Optional[str] = None, model_name: str = 'huggingface-model', label_name: str = 'labels', text_col: str = 'text', num_of_train_samples: Optional[int] = None, train_test_split_size: Optional[float] = None, metrics: Optional[List[str]] = None, random_state: Optional[int] = None)[source]#
-

Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. -The dataset can be either be the name of the dataset that contains in the HuggingFace hub, -or a URI or a FeatureVector

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • hf_dataset – The name of the dataset to get from the HuggingFace hub

  • -
  • dataset – The dataset to train the model on. Can be either a URI or a FeatureVector

  • -
  • test_set – The test set to train the model with.

  • -
  • drop_columns – The columns to drop from the dataset.

  • -
  • pretrained_tokenizer – The name of the pretrained tokenizer from the HuggingFace hub.

  • -
  • pretrained_model – The name of the pretrained model from the HuggingFace hub.

  • -
  • model_name – The model’s name to use for storing the model artifact, default to ‘model’

  • -
  • model_class – The class of the model, e.g. transformers.AutoModelForSequenceClassification

  • -
  • label_name – The target label of the column in the dataset.

  • -
  • text_col – The input text column un the dataset.

  • -
  • num_of_train_samples – Max number of training samples, for debugging.

  • -
  • train_test_split_size – Should be between 0.0 and 1.0 and represent the proportion of the dataset to include -in the test split.

  • -
  • metrics – List of different metrics for evaluate the model such as f1, accuracy etc.

  • -
  • random_state – Random state for train_test_split

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/static/example.html b/functions/development/hugging_face_classifier_trainer/0.1.0/static/example.html deleted file mode 100644 index 9598e23c..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/static/example.html +++ /dev/null @@ -1,539 +0,0 @@ - - - - - - - -MLRun Hugging Face Classifier Trainer Tutorial - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-

-
-

MLRun Hugging Face Classifier Trainer Tutorial#

-

This notebook shows how to use the handlers of the Hugging Face classifier trainer. -the following handlers are:

-
    -
  • train

  • -
  • optimize

  • -
-

All you need is simply HF model type and a HF dataset name.

-
-
-
%pip install -r requirements.txt
-
-
-
-
-
-
-
import mlrun
-
-
-
-
-
-
-
mlrun.get_or_create_project('hugging-face-trainer', context="./", user_project=True)
-
-
-
-
-
> 2023-02-26 15:26:59,980 [info] loaded project hugging-face-trainer from MLRun DB
-
-
-
<mlrun.projects.project.MlrunProject at 0x7ff44733f3d0>
-
-
-
-
-
-

Importing the hugging_face_classifier_trainer function from the Marketplace#

-
-
-
hugging_face_classifier_trainer = mlrun.import_function("hub://hugging_face_classifier_trainer")
-
-
-
-
-
-
-

Training a model#

-

Choosing the train handler

-
-

Define task parameters¶#

-
    -
  • Class parameters should contain the prefix CLASS_

  • -
  • Train parameters should contain the prefix TRAIN_

  • -
-
-
-
model_class = "transformers.AutoModelForSequenceClassification"
-additional_parameters = {
-    "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples",
-    "TRAIN_learning_rate": 2e-5,
-    "TRAIN_per_device_train_batch_size": 16,
-    "TRAIN_per_device_eval_batch_size": 16,
-    "TRAIN_num_train_epochs": 3,
-    "TRAIN_weight_decay": 0.01,
-    "TRAIN_push_to_hub": False,
-    "TRAIN_evaluation_strategy": "epoch",
-    "TRAIN_eval_steps": 1,
-    "TRAIN_logging_steps": 1,
-    "CLASS_num_labels": 2
-}
-
-
-
-
-
-
-

Running the Training job with the “train” handler#

-
-
-
train_run = hugging_face_classifier_trainer.run(params={
-                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
-                                                        "drop_columns": [
-                                                            "airline_sentiment_confidence",
-                                                            "negativereason_confidence",
-                                                        ],
-                                                        "pretrained_tokenizer": "distilbert-base-uncased",
-                                                        "pretrained_model": "distilbert-base-uncased",
-                                                        "model_class": "transformers.AutoModelForSequenceClassification",
-                                                        "label_name": "airline_sentiment",
-                                                        "num_of_train_samples": 100,
-                                                        "metrics": ["accuracy", "f1"],
-                                                        "random_state": 42,
-                                                        **additional_parameters
-                                                    },
-                                                    handler="train",
-                                                    local=True,
-                                                )
-
-
-
-
-
-
-

The result of the train run#

-
-
-
train_run.outputs
-
-
-
-
-
{'loss': 0.5363,
- 'learning_rate': 0.0,
- 'eval_loss': 0.48737242817878723,
- 'eval_accuracy': 0.7916666666666666,
- 'eval_f1': 0.0,
- 'eval_runtime': 0.5752,
- 'eval_samples_per_second': 41.722,
- 'eval_steps_per_second': 3.477,
- 'train_runtime': 17.5022,
- 'train_samples_per_second': 17.141,
- 'train_steps_per_second': 1.2,
- 'total_flos': 3327208489680.0,
- 'loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',
- 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',
- 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',
- 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',
- 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',
- 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',
- 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',
- 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',
- 'tokenizer': 'v3io:///projects/hugging-face-trainer-davids/artifacts/hugging-face-classifier-trainer-train/0/tokenizer.zip',
- 'model': 'store://artifacts/hugging-face-trainer-davids/huggingface-model:32a62cb55414402facbf47bb0470dc9f'}
-
-
-
-
-
-
-
train_run.artifact('loss_plot').show()
-
-
-
-
-
- - -
-
- -
-
-
-
-

Getting the model for evaluating and predicting#

-
-
-
model_path = train_run.outputs['model']
-
-
-
-
-
-
-
-

Optimize the model#

-

Choosing the optimize handler

-

The result of using this handled is an onnx optimized model.

-
-
-
optimize_run = hugging_face_classifier_trainer.run(params={
-                                                        "model_path": str(model_path)
-                                                    },
-                                                    handler="optimize",
-                                                    local=True,
-                                                )
-
-
-
-
-
-
-
optimize_run.outputs
-
-
-
-
-
{'model': 'store://artifacts/hugging-face-trainer-davids/optimized_model:2c355967689240de8964a1d10d137215'}
-
-
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/static/function.html b/functions/development/hugging_face_classifier_trainer/0.1.0/static/function.html deleted file mode 100644 index faba0f39..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/static/function.html +++ /dev/null @@ -1,403 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: hugging-face-classifier-trainer
-  tag: ''
-  hash: e5d9d8ea6d86779e43cb11c261c36e2812a75653
-  project: ''
-  labels:
-    author: davids
-  categories:
-  - machine-learning
-  - model-training
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  build:
-    functionSourceCode: 
-    commands:
-    - python -m pip install 'onnx~=1.10.1' 'onnxruntime~=1.8.1' 'optimum~=1.6.4' 'transformers~=4.26.1'
-      'datasets~=2.10.1' 'scikit-learn~=1.0.2'
-    code_origin: https://github.com/yonishelach/functions.git#1f6afed6bbe17186995c39b6e40067cfd8dc6e64:/Users/Yonatan_Shelach/projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py
-    origin_filename: /Users/Yonatan_Shelach/projects/functions/hugging_face_classifier_trainer/hugging_face_classifier_trainer.py
-  entry_points:
-    add_interface:
-      name: add_interface
-      doc: 'Enrich the object with this interface properties, methods and functions,
-        so it will have this TensorFlow.Keras
-
-        MLRuns features.'
-      parameters:
-      - name: cls
-        default: ''
-      - name: obj
-        type: Trainer
-        doc: The object to enrich his interface.
-        default: ''
-      - name: restoration
-        type: MLRunInterfaceRestorationType
-        doc: Restoration information tuple as returned from 'remove_interface' in
-          order to add the interface in a certain state.
-        default: null
-      outputs:
-      - default: ''
-      lineno: 146
-    mlrun_optimize:
-      name: mlrun_optimize
-      doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when
-        using horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 79
-    wrapper:
-      name: wrapper
-      doc: ''
-      parameters:
-      - name: self
-        type: Trainer
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 173
-    enable_auto_logging:
-      name: enable_auto_logging
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: model_name
-        type: str
-        default: model
-      - name: tag
-        type: str
-        default: ''
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs:
-      - default: ''
-      lineno: 114
-    mlrun_train:
-      name: mlrun_train
-      doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using
-        horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 164
-    on_epoch_begin:
-      name: on_epoch_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 220
-    on_epoch_end:
-      name: on_epoch_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 229
-    on_log:
-      name: on_log
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: logs
-        type: Dict[str, float]
-        default: null
-      outputs:
-      - default: ''
-      lineno: 238
-    on_train_begin:
-      name: on_train_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 262
-    on_train_end:
-      name: on_train_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: model
-        type: PreTrainedModel
-        default: null
-      - name: tokenizer
-        type: PreTrainedTokenizer
-        default: null
-      outputs:
-      - default: ''
-      lineno: 271
-    on_evaluate:
-      name: on_evaluate
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 322
-    apply_mlrun:
-      name: apply_mlrun
-      doc: Wrap the given model with MLRun's interface providing it with mlrun's additional
-        features.
-      parameters:
-      - name: huggingface_object
-        doc: The model to wrap. Can be loaded from the model path given as well.
-        default: ''
-      - name: model_name
-        type: str
-        doc: 'The model name to use for storing the model artifact. Default: "model".'
-        default: null
-      - name: tag
-        type: str
-        doc: The model's tag to log with.
-        default: ''
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context to work with. If no context is given it will be retrieved
-          via 'mlrun.get_or_create_ctx(None)'
-        default: null
-      - name: auto_log
-        type: bool
-        doc: 'Whether to enable MLRun''s auto logging. Default: True.'
-        default: true
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs:
-      - default: ''
-      lineno: 421
-    train:
-      name: train
-      doc: 'Training and evaluating a pretrained model with a pretrained tokenizer
-        over a dataset.
-
-        The dataset can be either be the name of the dataset that contains in the
-        HuggingFace hub,
-
-        or a URI or a FeatureVector'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: hf_dataset
-        type: str
-        doc: The name of the dataset to get from the HuggingFace hub
-        default: null
-      - name: dataset
-        type: DataItem
-        doc: The dataset to train the model on. Can be either a URI or a FeatureVector
-        default: null
-      - name: test_set
-        type: DataItem
-        doc: The test set to train the model with.
-        default: null
-      - name: drop_columns
-        type: Optional[List[str]]
-        doc: The columns to drop from the dataset.
-        default: null
-      - name: pretrained_tokenizer
-        type: str
-        doc: The name of the pretrained tokenizer from the HuggingFace hub.
-        default: null
-      - name: pretrained_model
-        type: str
-        doc: The name of the pretrained model from the HuggingFace hub.
-        default: null
-      - name: model_class
-        type: str
-        doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-        default: null
-      - name: model_name
-        type: str
-        doc: The model's name to use for storing the model artifact, default to 'model'
-        default: huggingface-model
-      - name: label_name
-        type: str
-        doc: The target label of the column in the dataset.
-        default: labels
-      - name: text_col
-        type: str
-        doc: The input text column un the dataset.
-        default: text
-      - name: num_of_train_samples
-        type: int
-        doc: Max number of training samples, for debugging.
-        default: null
-      - name: train_test_split_size
-        type: float
-        doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset
-          to include in the test split.
-        default: null
-      - name: metrics
-        type: List[str]
-        doc: List of different metrics for evaluate the model such as f1, accuracy
-          etc.
-        default: null
-      - name: random_state
-        type: int
-        doc: Random state for train_test_split
-        default: null
-      outputs:
-      - default: ''
-      lineno: 647
-    preprocess_function:
-      name: preprocess_function
-      doc: ''
-      parameters:
-      - name: examples
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 696
-    optimize:
-      name: optimize
-      doc: Optimizing the transformer model using ONNX optimization.
-      parameters:
-      - name: model_path
-        type: str
-        doc: The path of the model to optimize.
-        default: ''
-      - name: model_name
-        type: str
-        doc: Name of the optimized model.
-        default: optimized_model
-      - name: target_dir
-        type: str
-        doc: The directory to save the ONNX model.
-        default: ./optimized
-      - name: optimization_level
-        type: int
-        doc: Optimization level performed by ONNX Runtime of the loaded graph. (default
-          is 1)
-        default: 1
-      outputs:
-      - default: ''
-      lineno: 799
-  description: Automatic train and optimize functions for HuggingFace framework
-  default_handler: train
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/static/hugging_face_classifier_trainer.html b/functions/development/hugging_face_classifier_trainer/0.1.0/static/hugging_face_classifier_trainer.html deleted file mode 100644 index 99a105cb..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/static/hugging_face_classifier_trainer.html +++ /dev/null @@ -1,972 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer.hugging_face_classifier_trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for hugging_face_classifier_trainer.hugging_face_classifier_trainer

-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import mlrun.datastore
-import mlrun.utils
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-
[docs]class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "optimize", - ] - -
[docs] @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper
- -
[docs] def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data
- - -
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - -
[docs] @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper
- - -
[docs]class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - -
[docs] def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([])
- -
[docs] def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics()
- -
[docs] def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score)
- -
[docs] def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True
- -
[docs] def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - )
- -
[docs] def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return
- - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact)
- - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -
[docs]def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError
- - -# ---------------------- from auto_trainer-------------------------------- -
[docs]class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_"
- - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. - :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. 
- """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {<old_name>: <new_name>, ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -
[docs]def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. - :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. 
- :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." 
- ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train()
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -
[docs]def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/static/item.html b/functions/development/hugging_face_classifier_trainer/0.1.0/static/item.html deleted file mode 100644 index 4347c7f0..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/static/item.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-training
-description: Automatic train and optimize functions for HuggingFace framework
-doc: ''
-example: hugging_face_classifier_trainer.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: davids
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.2.0
-name: hugging_face_classifier_trainer
-platformVersion: 3.5.0
-spec:
-  filename: hugging_face_classifier_trainer.py
-  handler: train
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - onnx~=1.10.1
-  - onnxruntime~=1.8.1
-  - optimum~=1.6.4
-  - transformers~=4.26.1
-  - datasets~=2.10.1
-  - scikit-learn~=1.0.2
-url: ''
-version: 0.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.1.0/static/source.html b/functions/development/hugging_face_classifier_trainer/0.1.0/static/source.html deleted file mode 100644 index 6eee51f5..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.1.0/static/source.html +++ /dev/null @@ -1,854 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import mlrun.datastore
-import mlrun.utils
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRun's context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to be inserted so the MLRun interface will be fully enabled.
-    _PROPERTIES = {
-        "_auto_log": False,
-        "_context": None,
-        "_model_name": "model",
-        "_tag": "",
-        "_labels": None,
-        "_extra_data": None,
-    }
-    _METHODS = ["enable_auto_logging"]
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "optimize",
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRun's features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-        super(HFORTOptimizerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_optimize(cls):
-        """
-        MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self, *args, **kwargs):
-            save_dir = cls._get_function_argument(
-                self.optimize,
-                argument_name="save_dir",
-                passed_args=args,
-                passed_kwargs=kwargs,
-            )[0]
-
-            # Call the original optimize method:
-            result = self.original_optimize(*args, **kwargs)
-
-            if self._auto_log:
-                # Log the onnx model:
-                self._context.log_model(
-                    key="model",
-                    db_key=self._model_name,
-                    model_file=f"{save_dir}/model_optimized.onnx",
-                    tag=self._tag,
-                    framework="ONNX",
-                    labels=self._labels,
-                    extra_data=self._extra_data,
-                )
-
-            return result
-
-        return wrapper
-
-    def enable_auto_logging(
-        self,
-        context: mlrun.MLClientCtx,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        self._auto_log = True
-
-        self._context = context
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data
-
-
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "train",
-        # "evaluate"
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj: Trainer,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRuns features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-
-        super(HFTrainerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_train(cls):
-
-        """
-        MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self: Trainer, *args, **kwargs):
-            # Restore the evaluation method as `train` will use it:
-            # cls._restore_attribute(obj=self, attribute_name="evaluate")
-
-            # Call the original fit method:
-            result = self.original_train(*args, **kwargs)
-
-            # Replace the evaluation method again:
-            # cls._replace_function(obj=self, function_name="evaluate")
-
-            return result
-
-        return wrapper
-
-
-class MLRunCallback(TrainerCallback):
-    """
-    Callback for collecting logs during training / evaluation of the `Trainer` API.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.MLClientCtx = None,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        super().__init__()
-
-        # Store the configurations:
-        self._context = (
-            context
-            if context is not None
-            else mlrun.get_or_create_ctx("./mlrun-huggingface")
-        )
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data if extra_data is not None else {}
-
-        # Set up the logging mode:
-        self._is_training = False
-        self._steps: List[List[int]] = []
-        self._metric_scores: Dict[str, List[float]] = {}
-        self._artifacts: Dict[str, Artifact] = {}
-
-    def on_epoch_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._steps.append([])
-
-    def on_epoch_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float] = None,
-        **kwargs,
-    ):
-        recent_logs = state.log_history[-1].copy()
-
-        recent_logs.pop("epoch")
-        current_step = int(recent_logs.pop("step"))
-        if current_step not in self._steps[-1]:
-            self._steps[-1].append(current_step)
-
-        for metric_name, metric_score in recent_logs.items():
-            if metric_name.startswith("train_"):
-                if metric_name.split("train_")[1] not in self._metric_scores:
-                    self._metric_scores[metric_name] = [metric_score]
-                continue
-            if metric_name not in self._metric_scores:
-                self._metric_scores[metric_name] = []
-            self._metric_scores[metric_name].append(metric_score)
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._is_training = True
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: PreTrainedModel = None,
-        tokenizer: PreTrainedTokenizer = None,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        temp_directory = tempfile.gettempdir()
-
-        # Save and log the tokenizer:
-        if tokenizer is not None:
-            # Save tokenizer:
-            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
-            tokenizer.save_pretrained(save_directory=tokenizer_dir)
-            # Zip the tokenizer directory:
-            tokenizer_zip = shutil.make_archive(
-                base_name="tokenizer",
-                format="zip",
-                root_dir=tokenizer_dir,
-            )
-            # Log the zip file:
-            self._artifacts["tokenizer"] = self._context.log_artifact(
-                item="tokenizer", local_path=tokenizer_zip
-            )
-
-        # Save the model:
-        model_dir = os.path.join(temp_directory, "model")
-        model.save_pretrained(save_directory=model_dir)
-
-        # Zip the model directory:
-        shutil.make_archive(
-            base_name="model",
-            format="zip",
-            root_dir=model_dir,
-        )
-
-        # Log the model:
-        self._context.log_model(
-            key="model",
-            db_key=self._model_name,
-            model_file="model.zip",
-            tag=self._tag,
-            framework="Hugging Face",
-            labels=self._labels,
-            extra_data={**self._artifacts, **self._extra_data},
-        )
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        if self._is_training:
-            return
-
-        # TODO: Update the model object
-
-    def _log_metrics(self):
-        for metric_name, metric_scores in self._metric_scores.items():
-            self._context.log_result(key=metric_name, value=metric_scores[-1])
-            if len(metric_scores) > 1:
-                self._log_metric_plot(name=metric_name, scores=metric_scores)
-        self._context.commit(completed=False)
-
-    def _log_metric_plot(self, name: str, scores: List[float]):
-        # Initialize a plotly figure:
-        metric_figure = go.Figure()
-
-        # Add titles:
-        metric_figure.update_layout(
-            title=name.capitalize().replace("_", " "),
-            xaxis_title="Samples",
-            yaxis_title="Scores",
-        )
-
-        # Draw:
-        metric_figure.add_trace(
-            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
-        )
-
-        # Create the plotly artifact:
-        artifact_name = f"{name}_plot"
-        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
-        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
-
-
-def _apply_mlrun_on_trainer(
-    trainer: transformers.Trainer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
-
-    HFTrainerMLRunInterface.add_interface(obj=trainer)
-
-    if auto_log:
-        trainer.add_callback(
-            MLRunCallback(
-                context=context,
-                model_name=model_name,
-                tag=tag,
-                labels=labels,
-                extra_data=extra_data,
-            )
-        )
-
-
-def _apply_mlrun_on_optimizer(
-    optimizer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(
-            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
-        )
-
-    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)
-
-    if auto_log:
-        optimizer.enable_auto_logging(
-            context=context,
-            model_name=model_name,
-            tag=tag,
-            labels=labels,
-            extra_data=extra_data,
-        )
-
-
-def apply_mlrun(
-    huggingface_object,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    """
-    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
-    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
-    :param model_name:         The model name to use for storing the model artifact. Default: "model".
-    :param tag:                The model's tag to log with.
-    :param context:            MLRun context to work with. If no context is given it will be retrieved via
-                               'mlrun.get_or_create_ctx(None)'
-    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
-    """
-
-    if isinstance(huggingface_object, transformers.Trainer):
-        return _apply_mlrun_on_trainer(
-            trainer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    import optimum.onnxruntime as optimum_ort
-
-    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
-        return _apply_mlrun_on_optimizer(
-            optimizer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    raise mlrun.errors.MLRunInvalidArgumentError
-
-
-# ---------------------- from auto_trainer--------------------------------
-class KWArgsPrefixes:
-    MODEL_CLASS = "CLASS_"
-    FIT = "FIT_"
-    TRAIN = "TRAIN_"
-    PREDICT = "PREDICT_"
-
-
-def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
-    """
-    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
-    keys.
-
-    :param src:         The source dict to extract the values from.
-    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
-                        prefix.
-    """
-    return {
-        key.replace(prefix_key, ""): val
-        for key, val in src.items()
-        if key.startswith(prefix_key)
-    }
-
-
-def _get_dataframe(
-    context: MLClientCtx,
-    dataset: DataItem,
-    label_columns: Optional[Union[str, List[str]]] = None,
-    drop_columns: Union[str, List[str], int, List[int]] = None,
-) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
-    """
-    Getting the DataFrame of the dataset and drop the columns accordingly.
-
-    :param context:         MLRun context.
-    :param dataset:         The dataset to train the model on.
-                            Can be either a list of lists, dict, URI or a FeatureVector.
-    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
-                            Classification tasks.
-    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
-    """
-    if isinstance(dataset, (list, dict)):
-        dataset = pd.DataFrame(dataset)
-        # Checking if drop_columns provided by integer type:
-        if drop_columns:
-            if isinstance(drop_columns, str) or (
-                isinstance(drop_columns, list)
-                and any(isinstance(col, str) for col in drop_columns)
-            ):
-                context.logger.error(
-                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
-                )
-                raise ValueError
-            dataset.drop(drop_columns, axis=1, inplace=True)
-
-        return dataset, label_columns
-
-    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
-    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
-        # feature-vector case:
-        label_columns = label_columns or dataset.meta.status.label_column
-        dataset = fs.get_offline_features(
-            dataset.meta.uri, drop_columns=drop_columns
-        ).to_dataframe()
-
-        context.logger.info(f"label columns: {label_columns}")
-    else:
-        # simple URL case:
-        dataset = dataset.as_df()
-        if drop_columns:
-            if all(col in dataset for col in drop_columns):
-                dataset = dataset.drop(drop_columns, axis=1)
-            else:
-                context.logger.info(
-                    "not all of the columns to drop in the dataset, drop columns process skipped"
-                )
-    return dataset, label_columns
-
-
-# ---------------------- Hugging Face Trainer --------------------------------
-
-
-def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
-    """
-    This function create and returns a function that will be used to compute metrics at evaluation.
-    :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc.
-
-    :returns: Function that will be used to compute metrics at evaluation.
-             Must take a [`EvalPrediction`] and return a dictionary string to metric values.
-    """
-
-    def _compute_metrics(eval_pred):
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        metric_dict_results = {}
-        for metric in metrics:
-            load_met = load_metric(metric)
-            metric_res = load_met.compute(predictions=predictions, references=labels)[
-                metric
-            ]
-            metric_dict_results[metric] = metric_res
-
-        return metric_dict_results
-
-    return _compute_metrics
-
-
-def _edit_columns(
-    dataset: Dataset,
-    drop_columns: List[str] = None,
-    rename_columns: [str, str] = None,
-) -> Dataset:
-    """
-    Drop and renames that columns of the given dataset
-    :param dataset:         Dataset to process
-    :param drop_columns:    The columns to drop from the dataset.
-    :param rename_columns:  Dict of columns ro rename : {: , ...}
-
-    :returns: The dataset after the desired process
-    """
-    if drop_columns:
-        dataset = dataset.remove_columns(drop_columns)
-    if rename_columns:
-        dataset = dataset.rename_columns(rename_columns)
-    return dataset
-
-
-def _prepare_dataset(
-    context: MLClientCtx,
-    dataset_name: str,
-    label_name: str = None,
-    drop_columns: Optional[List[str]] = None,
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    random_state: int = None,
-) -> Tuple[Dataset, Dataset]:
-    """
-    Loading the dataset and editing the columns
-
-    :param context:                 MLRun contex
-    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
-    :param label_name:              The target label of the column in the dataset.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param random_state:            Random state for train_test_split
-
-    """
-
-    context.logger.info(
-        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
-    )
-    rename_cols = {label_name: "labels"}
-
-    # Loading and editing dataset:
-    dataset = load_dataset(dataset_name)
-
-    # train set
-    train_dataset = dataset["train"]
-    if num_of_train_samples:
-        train_dataset = train_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_train_samples))
-        )
-    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)
-
-    # test set
-    test_dataset = dataset["test"]
-    if train_test_split_size or num_of_train_samples:
-        train_test_split_size = train_test_split_size or 0.2
-        num_of_test_samples = int(
-            (train_dataset.num_rows * train_test_split_size)
-            // (1 - train_test_split_size)
-        )
-        test_dataset = test_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_test_samples))
-        )
-    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)
-
-    return train_dataset, test_dataset
-
-
-def train(
-    context: MLClientCtx,
-    hf_dataset: str = None,
-    dataset: DataItem = None,
-    test_set: DataItem = None,
-    drop_columns: Optional[List[str]] = None,
-    pretrained_tokenizer: str = None,
-    pretrained_model: str = None,
-    model_class: str = None,
-    model_name: str = "huggingface-model",
-    label_name: str = "labels",
-    text_col: str = "text",
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    metrics: List[str] = None,
-    random_state: int = None,
-):
-    """
-    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
-    The dataset can be either be the name of the dataset that contains in the HuggingFace hub,
-    or a URI or a FeatureVector
-
-    :param context:                 MLRun context
-    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
-    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
-    :param test_set:                The test set to train the model with.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
-    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
-    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
-    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-    :param label_name:              The target label of the column in the dataset.
-    :param text_col:                The input text column un the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param metrics:                 List of different metrics for evaluate the model such as f1, accuracy etc.
-    :param random_state:            Random state for train_test_split
-    """
-
-    if train_test_split_size is None and test_set is None:
-        context.logger.info(
-            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
-        )
-        train_test_split_size = 0.2
-
-    # Creating tokenizer:
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)
-
-    def preprocess_function(examples):
-        return tokenizer(examples[text_col], truncation=True)
-
-    # prepare data for training
-    if hf_dataset:
-        train_dataset, test_dataset = _prepare_dataset(
-            context,
-            hf_dataset,
-            label_name,
-            drop_columns,
-            num_of_train_samples,
-            train_test_split_size,
-            random_state=random_state,
-        )
-    elif dataset:
-        # Get DataFrame by URL or by FeatureVector:
-        train_dataset, label_name = _get_dataframe(
-            context=context,
-            dataset=dataset,
-            label_columns=label_name,
-            drop_columns=drop_columns,
-        )
-        if test_set:
-            test_dataset, _ = _get_dataframe(
-                context=context,
-                dataset=test_set,
-                label_columns=label_name,
-                drop_columns=drop_columns,
-            )
-        else:
-            train_dataset, test_dataset = train_test_split(
-                train_dataset,
-                test_size=train_test_split_size,
-                random_state=random_state,
-            )
-        train_dataset = Dataset.from_pandas(train_dataset)
-        test_dataset = Dataset.from_pandas(test_dataset)
-    else:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            "Training data was not provided. A training dataset is mandatory for training."
-            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
-        )
-
-    # Mapping datasets with the tokenizer:
-    tokenized_train = train_dataset.map(preprocess_function, batched=True)
-    tokenized_test = test_dataset.map(preprocess_function, batched=True)
-
-    # Creating data collator for batching:
-    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-    # Parsing kwargs:
-    train_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
-    )
-    model_class_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
-    )
-
-    # Loading our pretrained model:
-    model_class_kwargs["pretrained_model_name_or_path"] = (
-        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
-    )
-    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
-    if not model_class_kwargs["pretrained_model_name_or_path"]:
-        raise mlrun.errors.MLRunRuntimeError(
-            "Must provide pretrained_model name as "
-            "function argument or in extra params"
-        )
-    model = create_class(model_class).from_pretrained(**model_class_kwargs)
-
-    # Preparing training arguments:
-    training_args = TrainingArguments(
-        **train_kwargs,
-    )
-
-    compute_metrics = _create_compute_metrics(metrics) if metrics else None
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_test,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics,
-    )
-
-    apply_mlrun(trainer, model_name=model_name)
-
-    # Apply training with evaluation:
-    context.logger.info(f"training '{model_name}'")
-    trainer.train()
-
-
-def _get_model_dir(model_uri: str):
-    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
-    model_dir = tempfile.gettempdir()
-    # Unzip the Model:
-    with zipfile.ZipFile(model_file, "r") as zip_file:
-        zip_file.extractall(model_dir)
-
-    return model_dir
-
-
-def optimize(
-    model_path: str,
-    model_name: str = "optimized_model",
-    target_dir: str = "./optimized",
-    optimization_level: int = 1,
-):
-    """
-    Optimizing the transformer model using ONNX optimization.
-
-
-    :param model_path:          The path of the model to optimize.
-    :param model_name:          Name of the optimized model.
-    :param target_dir:          The directory to save the ONNX model.
-    :param optimization_level:  Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)
-    """
-    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
-    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
-    from optimum.onnxruntime.configuration import OptimizationConfig
-
-    model_dir = _get_model_dir(model_uri=model_path)
-    # Creating configuration for optimization step:
-    optimization_config = OptimizationConfig(optimization_level=optimization_level)
-
-    # Converting our pretrained model to an ONNX-Runtime model:
-    ort_model = ORTModelForSequenceClassification.from_pretrained(
-        model_dir, from_transformers=True
-    )
-
-    # Creating an ONNX-Runtime optimizer from ONNX model:
-    optimizer = ORTOptimizer.from_pretrained(ort_model)
-
-    apply_mlrun(optimizer, model_name=model_name)
-    # Optimizing and saving the ONNX model:
-    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/src/function.yaml b/functions/development/hugging_face_classifier_trainer/0.2.0/src/function.yaml deleted file mode 100644 index eb223b2b..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/src/function.yaml +++ /dev/null @@ -1,368 +0,0 @@ -kind: job -metadata: - name: hugging-face-classifier-trainer - tag: '' - hash: e8113e81f04c96fc9a8a94e717dea81ee3e05a18 - project: '' - labels: - author: davids - categories: - - machine-learning - - model-training -spec: - command: '' - args: [] - image: '' - build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' - origin_filename: '' - requirements: - - onnx~=1.14.1 - - onnxruntime~=1.16.1 - - optimum~=1.6.4 - - transformers~=4.26.1 - - datasets~=2.10.1 - - scikit-learn~=1.0.2 - entry_points: - add_interface: - name: add_interface - doc: 'Enrich the object with this interface properties, methods and functions, - so it will have this TensorFlow.Keras - - MLRuns features.' - parameters: - - name: cls - - name: obj - type: Trainer - doc: The object to enrich his interface. - - name: restoration - type: MLRunInterfaceRestorationType - doc: Restoration information tuple as returned from 'remove_interface' in - order to add the interface in a certain state. - default: null - outputs: [] - lineno: 146 - has_varargs: false - has_kwargs: false - mlrun_optimize: - name: mlrun_optimize - doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when - using horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' 
- parameters: - - name: cls - outputs: [] - lineno: 79 - has_varargs: false - has_kwargs: false - wrapper: - name: wrapper - doc: '' - parameters: - - name: self - type: Trainer - outputs: [] - lineno: 173 - has_varargs: true - has_kwargs: true - enable_auto_logging: - name: enable_auto_logging - doc: '' - parameters: - - name: self - - name: context - type: MLClientCtx - - name: model_name - type: str - default: model - - name: tag - type: str - default: '' - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: [] - lineno: 114 - has_varargs: false - has_kwargs: false - mlrun_train: - name: mlrun_train - doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using - horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' 
- parameters: - - name: cls - outputs: [] - lineno: 164 - has_varargs: false - has_kwargs: false - on_epoch_begin: - name: on_epoch_begin - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 220 - has_varargs: false - has_kwargs: true - on_epoch_end: - name: on_epoch_end - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 229 - has_varargs: false - has_kwargs: true - on_log: - name: on_log - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - - name: logs - type: Dict[str, float] - default: null - outputs: [] - lineno: 238 - has_varargs: false - has_kwargs: true - on_train_begin: - name: on_train_begin - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 262 - has_varargs: false - has_kwargs: true - on_train_end: - name: on_train_end - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - - name: model - type: PreTrainedModel - default: null - - name: tokenizer - type: PreTrainedTokenizer - default: null - outputs: [] - lineno: 271 - has_varargs: false - has_kwargs: true - on_evaluate: - name: on_evaluate - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 322 - has_varargs: false - has_kwargs: true - apply_mlrun: - name: apply_mlrun - doc: Wrap the given model with MLRun's interface providing it with mlrun's additional - features. 
- parameters: - - name: huggingface_object - doc: The model to wrap. Can be loaded from the model path given as well. - - name: model_name - type: str - doc: 'The model name to use for storing the model artifact. Default: "model".' - default: null - - name: tag - type: str - doc: The model's tag to log with. - default: '' - - name: context - type: MLClientCtx - doc: MLRun context to work with. If no context is given it will be retrieved - via 'mlrun.get_or_create_ctx(None)' - default: null - - name: auto_log - type: bool - doc: 'Whether to enable MLRun''s auto logging. Default: True.' - default: true - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: [] - lineno: 421 - has_varargs: false - has_kwargs: true - train: - name: train - doc: 'Training and evaluating a pretrained model with a pretrained tokenizer - over a dataset. - - The dataset can be either be the name of the dataset that contains in the - HuggingFace hub, - - or a URI or a FeatureVector' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - - name: hf_dataset - type: str - doc: The name of the dataset to get from the HuggingFace hub - default: null - - name: dataset - type: DataItem - doc: The dataset to train the model on. Can be either a URI or a FeatureVector - default: null - - name: test_set - type: DataItem - doc: The test set to train the model with. - default: null - - name: drop_columns - type: Optional[List[str]] - doc: The columns to drop from the dataset. - default: null - - name: pretrained_tokenizer - type: str - doc: The name of the pretrained tokenizer from the HuggingFace hub. - default: null - - name: pretrained_model - type: str - doc: The name of the pretrained model from the HuggingFace hub. - default: null - - name: model_class - type: str - doc: The class of the model, e.g. 
`transformers.AutoModelForSequenceClassification` - default: null - - name: model_name - type: str - doc: The model's name to use for storing the model artifact, default to 'model' - default: huggingface-model - - name: label_name - type: str - doc: The target label of the column in the dataset. - default: labels - - name: text_col - type: str - doc: The input text column un the dataset. - default: text - - name: num_of_train_samples - type: int - doc: Max number of training samples, for debugging. - default: null - - name: train_test_split_size - type: float - doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset - to include in the test split. - default: null - - name: metrics - type: List[str] - doc: List of different metrics for evaluate the model such as f1, accuracy - etc. - default: null - - name: random_state - type: int - doc: Random state for train_test_split - default: null - outputs: [] - lineno: 647 - has_varargs: false - has_kwargs: false - preprocess_function: - name: preprocess_function - doc: '' - parameters: - - name: examples - outputs: [] - lineno: 696 - has_varargs: false - has_kwargs: false - optimize: - name: optimize - doc: Optimizing the transformer model using ONNX optimization. - parameters: - - name: model_path - type: str - doc: The path of the model to optimize. - - name: model_name - type: str - doc: Name of the optimized model. - default: optimized_model - - name: target_dir - type: str - doc: The directory to save the ONNX model. - default: ./optimized - - name: optimization_level - type: int - doc: Optimization level performed by ONNX Runtime of the loaded graph. 
(default - is 1) - default: 1 - outputs: [] - lineno: 799 - has_varargs: false - has_kwargs: false - description: Automatic train and optimize functions for HuggingFace framework - default_handler: train - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/src/hugging_face_classifier_trainer.ipynb b/functions/development/hugging_face_classifier_trainer/0.2.0/src/hugging_face_classifier_trainer.ipynb deleted file mode 100644 index 2768d2dc..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/src/hugging_face_classifier_trainer.ipynb +++ /dev/null @@ -1,2533 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "\n", - "# MLRun Hugging Face Classifier Trainer Tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "This notebook shows how to use the handlers of the Hugging Face classifier trainer.\n", - "the following handlers are:\n", - "- `train`\n", - "- `optimize`\n", - "\n", - "All you need is simply **HF model type** and a **HF dataset name**." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: onnx~=1.14.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (1.14.1)\n", - "Requirement already satisfied: onnxruntime==1.16.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.16.1)\n", - "Requirement already satisfied: optimum~=1.6.4 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.6.4)\n", - "Requirement already satisfied: transformers~=4.26.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (4.26.1)\n", - "Requirement already satisfied: datasets~=2.10.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (2.10.1)\n", - "Requirement already satisfied: scikit-learn~=1.0.2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.0.2)\n", - "Requirement already satisfied: coloredlogs in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (15.0.1)\n", - "Requirement already satisfied: flatbuffers in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)\n", - "Requirement already satisfied: numpy>=1.21.6 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.23.5)\n", - "Requirement already satisfied: packaging in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (21.3)\n", - "Requirement already satisfied: protobuf in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt 
(line 2)) (3.20.2)\n", - "Requirement already satisfied: sympy in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)\n", - "Requirement already satisfied: typing-extensions>=3.6.2.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnx~=1.14.1->-r requirements.txt (line 1)) (4.7.1)\n", - "Requirement already satisfied: torch>=1.9 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.2)\n", - "Requirement already satisfied: huggingface-hub>=0.8.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (0.20.1)\n", - "Requirement already satisfied: filelock in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (3.13.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (5.4.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2023.12.25)\n", - "Requirement already satisfied: requests in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2.31.0)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (0.13.3)\n", - "Requirement already satisfied: tqdm>=4.27 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (4.65.0)\n", - "Requirement already satisfied: pyarrow>=6.0.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (11.0.0)\n", - "Requirement already satisfied: 
dill<0.3.7,>=0.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.3.6)\n", - "Requirement already satisfied: pandas in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.4)\n", - "Requirement already satisfied: xxhash in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.3.0)\n", - "Requirement already satisfied: multiprocess in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.70.14)\n", - "Requirement already satisfied: fsspec>=2021.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from fsspec[http]>=2021.11.1->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.9.2)\n", - "Requirement already satisfied: aiohttp in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.9.1)\n", - "Requirement already satisfied: responses<0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.18.0)\n", - "Requirement already satisfied: scipy>=1.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.11.4)\n", - "Requirement already satisfied: joblib>=0.11 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (3.2.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (19.1.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in 
/conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.9.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.3.1)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (4.0.3)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from packaging->onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.1.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2.1.1)\n", - "Requirement already satisfied: idna<4,>=2.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (1.26.16)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2023.7.22)\n", - "Requirement already satisfied: networkx in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.2.1)\n", - "Requirement 
already satisfied: jinja2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.1.3)\n", - "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (8.9.2.26)\n", - "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.3.1)\n", - "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.0.2.54)\n", - "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (10.3.2.106)\n", - "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.4.5.107)\n", - "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) 
(12.1.0.106)\n", - "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.18.1)\n", - "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: triton==2.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.0)\n", - "Requirement already satisfied: nvidia-nvjitlink-cu12 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.3.101)\n", - "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers[sentencepiece]>=4.26.0->optimum~=1.6.4->-r requirements.txt (line 3)) (0.2.0)\n", - "Requirement already satisfied: humanfriendly>=9.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from coloredlogs->onnxruntime==1.16.1->-r requirements.txt (line 2)) (9.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.3.post1)\n", - "Requirement already satisfied: mpmath>=0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from sympy->onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.3.0)\n", - "Requirement already satisfied: six>=1.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (1.16.0)\n", - "Requirement 
already satisfied: MarkupSafe>=2.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from jinja2->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.3)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:10:17,091 [info] Project loaded successfully: {'project_name': 'hugging-face-trainer'}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project('hugging-face-trainer', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Importing the hugging_face_classifier_trainer function from the Marketplace**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "hugging_face_classifier_trainer = mlrun.import_function(\"hub://hugging_face_classifier_trainer\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Training a model**\n", - "\n", - "Choosing the `train` handler" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Define task parameters¶\n", - "* Class parameters should contain the prefix `CLASS_`\n", - "* Train parameters should contain the prefix `TRAIN_`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_class = 
\"transformers.AutoModelForSequenceClassification\"\n", - "additional_parameters = {\n", - " \"TRAIN_output_dir\": \"finetuning-sentiment-model-3000-samples\",\n", - " \"TRAIN_learning_rate\": 2e-5,\n", - " \"TRAIN_per_device_train_batch_size\": 16,\n", - " \"TRAIN_per_device_eval_batch_size\": 16,\n", - " \"TRAIN_num_train_epochs\": 3,\n", - " \"TRAIN_weight_decay\": 0.01,\n", - " \"TRAIN_push_to_hub\": False,\n", - " \"TRAIN_evaluation_strategy\": \"epoch\",\n", - " \"TRAIN_eval_steps\": 1,\n", - " \"TRAIN_logging_steps\": 1,\n", - " \"CLASS_num_labels\": 2\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Running the Training job with the \"train\" handler" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:10:21,025 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '514d8d5530c842238b1cc81983cd943e', 'db': 'http://mlrun-api:8080'}\n", - "> 2024-03-24 17:11:03,727 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2\n", - "> 2024-03-24 17:11:03,882 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f43b1388d0b344888323bec590baadee", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/3 [00:00 2024-03-24 17:11:08,938 [info] training 'huggingface-model'\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "***** Running training *****\n", - " Num examples = 100\n", - " Num Epochs = 3\n", - " Instantaneous batch size per device = 16\n", - " Total train batch size (w. parallel, distributed & accumulation) = 16\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 21\n", - " Number of trainable parameters = 66955010\n", - "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [21/21 00:15, Epoch 3/3]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EpochTraining LossValidation LossAccuracyF1
10.7389000.5153110.7916670.000000
20.5259000.4815630.7916670.000000
30.4908000.4716750.7916670.000000

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "***** Running Evaluation *****\n", - " Num examples = 24\n", - " Batch size = 16\n", - "/tmp/tmp0c1aawrq.py:561: FutureWarning:\n", - "\n", - "load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", - "\n", - "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "***** Running Evaluation *****\n", - " Num examples = 24\n", - " Batch size = 16\n", - "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "***** Running Evaluation *****\n", - " Num examples = 24\n", - " Batch size = 16\n", - "\n", - "\n", - "Training completed. Do not forget to share your model on huggingface.co/models =)\n", - "\n", - "\n", - "tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json\n", - "Special tokens file saved in /tmp/tokenizer/special_tokens_map.json\n", - "Configuration saved in /tmp/model/config.json\n", - "Model weights saved in /tmp/model/pytorch_model.bin\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:10:21completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.4908
learning_rate=0.0
eval_loss=0.47167453169822693
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=0.5186
eval_samples_per_second=46.276
eval_steps_per_second=3.856
train_runtime=17.6054
train_samples_per_second=17.04
train_steps_per_second=1.193
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:12:01,880 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" - ] - } - ], - "source": [ - "train_run = hugging_face_classifier_trainer.run(params={\n", - " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", - " \"drop_columns\": [\n", - " \"airline_sentiment_confidence\",\n", - " \"negativereason_confidence\",\n", - " ],\n", - " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", - " \"pretrained_model\": \"distilbert-base-uncased\",\n", - " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", - " \"label_name\": \"airline_sentiment\",\n", - " \"num_of_train_samples\": 100,\n", - " \"metrics\": [\"accuracy\", \"f1\"],\n", - " \"random_state\": 42,\n", - " **additional_parameters\n", - " },\n", - " handler=\"train\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### The result of the train run" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'loss': 0.4908,\n", - " 'learning_rate': 0.0,\n", - " 'eval_loss': 0.47167453169822693,\n", - " 'eval_accuracy': 0.7916666666666666,\n", - " 'eval_f1': 0.0,\n", - " 'eval_runtime': 0.5186,\n", - " 'eval_samples_per_second': 46.276,\n", - " 'eval_steps_per_second': 3.856,\n", - " 'train_runtime': 17.6054,\n", - " 'train_samples_per_second': 17.04,\n", - " 
'train_steps_per_second': 1.193,\n", - " 'total_flos': 3327208489680.0,\n", - " 'loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',\n", - " 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',\n", - " 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',\n", - " 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',\n", - " 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',\n", - " 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',\n", - " 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',\n", - " 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',\n", - " 'tokenizer': 'store://artifacts/hugging-face-trainer-avia/hugging-face-classifier-trainer-train_tokenizer@514d8d5530c842238b1cc81983cd943e',\n", - " 'model': 'store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_run.outputs" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "train_run.artifact('loss_plot').show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Getting the model for evaluating and predicting" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_path = train_run.outputs['model']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Optimize the model**\n", - "\n", - "Choosing the `optimize` handler\n", - "\n", - "The result of using this handled is an onnx optimized model." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:12:02,020 [info] Storing function: {'name': 'hugging-face-classifier-trainer-optimize', 'uid': 'fbee1ead18444824a4b5c0308a677bf4', 'db': 'http://mlrun-api:8080'}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/optimum/onnxruntime/configuration.py:726: FutureWarning:\n", - "\n", - "disable_embed_layer_norm will be deprecated soon, use disable_embed_layer_norm_fusion instead, disable_embed_layer_norm_fusion is set to True.\n", - "\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/config.json\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " 
\"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading weights file /tmp/pytorch_model.bin\n", - "All model checkpoint weights were used when initializing DistilBertForSequenceClassification.\n", - "\n", - "All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at /tmp.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.\n", - "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py:218: TracerWarning:\n", - "\n", - "torch.tensor results are registered as constants in the trace. 
You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - "\n", - "Configuration saved in /tmp/tmp79wjp8m8/config.json\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " 
\"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 
0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " 
\"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Configuration saved in optimized/config.json\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": 
true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.0/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.0/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.0/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.0/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.1/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.1/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.1/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.1/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.2/attention/Transpose_output_0\"\n", - "input: 
\"/distilbert/transformer/layer.2/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.2/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.2/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.3/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.3/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.3/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.3/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.4/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.4/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.4/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.4/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.5/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.5/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.5/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.5/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Configuration saved in optimized/config.json\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:12:02completedhugging-face-classifier-trainer-optimize
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
model_path=store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:12:22,721 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-optimize'}\n" - ] - } - ], - "source": [ - "optimize_run = hugging_face_classifier_trainer.run(params={\n", - " \"model_path\": str(model_path)\n", - " },\n", - " handler=\"optimize\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'model': 'store://artifacts/hugging-face-trainer-avia/optimized_model@fbee1ead18444824a4b5c0308a677bf4'}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "optimize_run.outputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the training remotely**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlrun/projects/operations.py:276: OverwriteBuildParamsWarning:\n", - "\n", - "The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:14:22,792 [info] Started building image: .mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest\n", - "\u001b[36mINFO\u001b[0m[0000] Retrieving image manifest 
mlrun/mlrun:1.6.1 \n", - "\u001b[36mINFO\u001b[0m[0000] Retrieving image mlrun/mlrun:1.6.1 from registry index.docker.io \n", - "\u001b[36mINFO\u001b[0m[0000] Built cross stage deps: map[] \n", - "\u001b[36mINFO\u001b[0m[0000] Retrieving image manifest mlrun/mlrun:1.6.1 \n", - "\u001b[36mINFO\u001b[0m[0000] Returning cached image manifest \n", - "\u001b[36mINFO\u001b[0m[0000] Executing 0 build triggers \n", - "\u001b[36mINFO\u001b[0m[0000] Building stage 'mlrun/mlrun:1.6.1' [idx: '0', base-idx: '-1'] \n", - "\u001b[36mINFO\u001b[0m[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. \n", - "\u001b[36mINFO\u001b[0m[0047] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt \n", - "\u001b[36mINFO\u001b[0m[0047] Initializing snapshotter ... \n", - "\u001b[36mINFO\u001b[0m[0047] Taking snapshot of full filesystem... \n", - "\u001b[36mINFO\u001b[0m[0074] Cmd: /bin/sh \n", - "\u001b[36mINFO\u001b[0m[0074] Args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] \n", - "\u001b[36mINFO\u001b[0m[0074] Running: [/bin/sh -c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] \n", - "Installing /empty/requirements.txt...\n", - "mlrun[complete]==1.6.1\n", - "onnx~=1.14.1\n", - "onnxruntime~=1.16.1\n", - "optimum~=1.6.4\n", - "transformers~=4.26.1\n", - "datasets~=2.10.1\n", - "scikit-learn~=1.0.2\n", - "\u001b[36mINFO\u001b[0m[0074] Taking snapshot of full filesystem... \n", - "\u001b[36mINFO\u001b[0m[0078] No files were changed, appending empty layer to config. No layer added to image. 
\n", - "\u001b[36mINFO\u001b[0m[0078] RUN python -m pip install -r /empty/requirements.txt \n", - "\u001b[36mINFO\u001b[0m[0078] Cmd: /bin/sh \n", - "\u001b[36mINFO\u001b[0m[0078] Args: [-c python -m pip install -r /empty/requirements.txt] \n", - "\u001b[36mINFO\u001b[0m[0078] Running: [/bin/sh -c python -m pip install -r /empty/requirements.txt] \n", - "Requirement already satisfied: mlrun[complete]==1.6.1 in /opt/conda/lib/python3.9/site-packages (from -r /empty/requirements.txt (line 1)) (1.6.1)\n", - "Collecting onnx~=1.14.1 (from -r /empty/requirements.txt (line 2))\n", - " Obtaining dependency information for onnx~=1.14.1 from https://files.pythonhosted.org/packages/ff/24/0e522fdcadf0e15fc304145a5b6e5d7246d7f2c507fd9bfe6e1fafb2aa95/onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n", - "Collecting onnxruntime~=1.16.1 (from -r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for onnxruntime~=1.16.1 from https://files.pythonhosted.org/packages/de/ab/ed3ae0d649cee41e870f8b1653cf4a1c1fc321e0ded4e3e1a3d4a25c0131/onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n", - "Collecting optimum~=1.6.4 (from -r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for optimum~=1.6.4 from https://files.pythonhosted.org/packages/31/72/a7e3b2c57d6368c5f4bb6fba54a85cbf07d25c385a2db3f1a638f3c0ddb2/optimum-1.6.4-py3-none-any.whl.metadata\n", - " Downloading optimum-1.6.4-py3-none-any.whl.metadata (17 kB)\n", - "Collecting transformers~=4.26.1 (from -r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for transformers~=4.26.1 from 
https://files.pythonhosted.org/packages/1e/e2/60c3f4691b16d126ee9cfe28f598b13c424b60350ab339aba81aef054b8f/transformers-4.26.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.26.1-py3-none-any.whl.metadata (100 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.3/100.3 kB 6.2 MB/s eta 0:00:00\n", - "Collecting datasets~=2.10.1 (from -r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for datasets~=2.10.1 from https://files.pythonhosted.org/packages/fe/17/5825fdf034ff1a315becdbb9b6fe5a2bd9d8e724464535f18809593bf9c2/datasets-2.10.1-py3-none-any.whl.metadata\n", - " Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)\n", - "Collecting scikit-learn~=1.0.2 (from -r /empty/requirements.txt (line 7))\n", - " Obtaining dependency information for scikit-learn~=1.0.2 from https://files.pythonhosted.org/packages/57/aa/483fbe6b5314bce2d49801e6cec1f2139a9c220d0d51494788fff47233b3/scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n", - "Requirement already satisfied: urllib3<1.27,>=1.26.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.18)\n", - "Requirement already satisfied: GitPython>=3.1.41,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.42)\n", - "Requirement already satisfied: aiohttp~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.3)\n", - "Requirement already satisfied: aiohttp-retry~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.3)\n", - "Requirement already satisfied: click~=8.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) 
(8.1.7)\n", - "Requirement already satisfied: kfp~=1.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.22)\n", - "Requirement already satisfied: nest-asyncio~=1.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.0)\n", - "Requirement already satisfied: ipython~=8.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.18.1)\n", - "Requirement already satisfied: nuclio-jupyter~=0.9.15 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.16)\n", - "Requirement already satisfied: numpy<1.27.0,>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.4)\n", - "Requirement already satisfied: pandas<2.2,>=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.4)\n", - "Requirement already satisfied: pyarrow<15,>=10.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (14.0.2)\n", - "Requirement already satisfied: pyyaml~=5.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.4.1)\n", - "Requirement already satisfied: requests~=2.31 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.31.0)\n", - "Requirement already satisfied: tabulate~=0.8.6 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.10)\n", - "Requirement already satisfied: v3io~=0.5.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.23)\n", - "Requirement already satisfied: pydantic>=1.10.8,~=1.10 in 
/opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.10.14)\n", - "Requirement already satisfied: mergedeep~=1.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.4)\n", - "Requirement already satisfied: v3io-frames~=0.10.12 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.13)\n", - "Requirement already satisfied: semver~=3.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: dependency-injector~=4.41 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.41.0)\n", - "Requirement already satisfied: fsspec==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", - "Requirement already satisfied: v3iofs~=0.1.17 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.18)\n", - "Requirement already satisfied: storey~=1.6.18 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.18)\n", - "Requirement already satisfied: inflection~=0.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", - "Requirement already satisfied: python-dotenv~=0.17.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.17.1)\n", - "Requirement already satisfied: setuptools~=68.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (68.2.2)\n", - "Requirement already satisfied: deprecated~=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1)) (1.2.14)\n", - "Requirement already satisfied: jinja2>=3.1.3,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.3)\n", - "Requirement already satisfied: anyio~=3.7 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.7.1)\n", - "Requirement already satisfied: orjson~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.15)\n", - "Requirement already satisfied: adlfs==2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.0)\n", - "Requirement already satisfied: aiobotocore<2.8,>=2.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5.4)\n", - "Requirement already satisfied: avro~=1.11 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.11.3)\n", - "Requirement already satisfied: azure-core~=1.24 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.30.0)\n", - "Requirement already satisfied: azure-identity~=1.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.15.0)\n", - "Requirement already satisfied: azure-keyvault-secrets~=4.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.8.0)\n", - "Requirement already satisfied: boto3<1.29.0,>=1.28.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.28.17)\n", - "Requirement already satisfied: dask~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)\n", - "Requirement already satisfied: 
databricks-sdk~=0.13.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.13.0)\n", - "Requirement already satisfied: distributed~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)\n", - "Requirement already satisfied: gcsfs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", - "Requirement already satisfied: google-cloud-bigquery[bqstorage,pandas]==3.14.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.14.1)\n", - "Requirement already satisfied: graphviz~=0.20.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.1)\n", - "Requirement already satisfied: kafka-python~=2.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.2)\n", - "Requirement already satisfied: mlflow~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.10.2)\n", - "Requirement already satisfied: msrest~=0.6.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.21)\n", - "Requirement already satisfied: plotly<5.12.0,~=5.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.11.0)\n", - "Requirement already satisfied: pyopenssl>=23 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.0)\n", - "Requirement already satisfied: redis~=4.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.6.0)\n", - "Requirement already satisfied: s3fs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from 
mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", - "Requirement already satisfied: sqlalchemy~=1.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.51)\n", - "Requirement already satisfied: azure-datalake-store<0.1,>=0.0.46 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.0.53)\n", - "Requirement already satisfied: azure-storage-blob>=12.12.0 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (12.19.0)\n", - "Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.1.1)\n", - "Requirement already satisfied: google-auth>=1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.28.1)\n", - "Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: google-cloud-storage in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.14.0)\n", - "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.1)\n", - "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)\n", - "Requirement already satisfied: 
google-resumable-media<3.0dev,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.7.0)\n", - "Requirement already satisfied: packaging>=20.0.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1)\n", - "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)\n", - "Requirement already satisfied: db-dtypes<2.0.0dev,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.24.0)\n", - "Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)\n", - "Requirement already satisfied: protobuf>=3.20.2 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (3.20.3)\n", - "Requirement already satisfied: typing-extensions>=3.6.2.1 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (4.10.0)\n", - "Collecting coloredlogs (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for coloredlogs from 
https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata\n", - " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n", - "Collecting flatbuffers (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/bf/45/c961e3cb6ddad76b325c163d730562bb6deb1ace5acbed0306f5fbefb90e/flatbuffers-24.3.7-py2.py3-none-any.whl.metadata\n", - " Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)\n", - "Collecting sympy (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/d2/05/e6600db80270777c4a64238a98d442f0fd07cc8915be2a1c16da7f2b9e74/sympy-1.12-py3-none-any.whl.metadata\n", - " Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)\n", - "Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0a/fd/280f4385e76f3c1890efc15fa93f7206134fefad6351397e1bfab6d0d0de/transformers-4.39.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 40.1 MB/s eta 0:00:00\n", - "Collecting torch>=1.9 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for torch>=1.9 from https://files.pythonhosted.org/packages/98/04/95a12556d068786d6505c609daf2805bed91c9210c5185499a7c121eba47/torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata\n", - " Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata (25 kB)\n", - "Collecting numpy<1.27.0,>=1.16.5 (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1))\n", - " 
Obtaining dependency information for numpy<1.27.0,>=1.16.5 from https://files.pythonhosted.org/packages/4c/b9/038abd6fbd67b05b03cb1af590cfc02b7f1e5a37af7ac6a868f5093c29f5/numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)\n", - "Collecting huggingface-hub>=0.8.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for huggingface-hub>=0.8.0 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata\n", - " Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)\n", - "Collecting filelock (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata\n", - " Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)\n", - "Collecting regex!=2019.12.17 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/05/9e/80c20f1151432a6025690c9c2037053039b028a7b236fa81d7e7ac9dec60/regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 217.5 MB/s eta 0:00:00\n", - "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for tokenizers!=0.11.3,<0.14,>=0.11.1 from 
https://files.pythonhosted.org/packages/d6/27/07a337087dd507170a1b20fed3bbf8da81401185a7130a6e74e440c52040/tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", - "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.9/site-packages (from transformers~=4.26.1->-r /empty/requirements.txt (line 5)) (4.65.0)\n", - "Collecting dill<0.3.7,>=0.3.0 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for dill<0.3.7,>=0.3.0 from https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl.metadata\n", - " Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)\n", - "Requirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets~=2.10.1->-r /empty/requirements.txt (line 6)) (3.4.1)\n", - "Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl.metadata\n", - " Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)\n", - "Collecting responses<0.19 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for responses<0.19 from https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl.metadata\n", - " Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)\n", - "Requirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.12.0)\n", - "Requirement already satisfied: joblib>=0.11 in 
/opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (3.3.0)\n", - "Requirement already satisfied: botocore<1.31.18,>=1.31.17 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.31.17)\n", - "Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", - "Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.11.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.9.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1)) (4.0.3)\n", - "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.4)\n", - "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", - "Requirement already satisfied: exceptiongroup in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: six>=1.11.0 in /opt/conda/lib/python3.9/site-packages (from azure-core~=1.24->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", - "Requirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (42.0.2)\n", - "Requirement already satisfied: msal<2.0.0,>=1.24.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.27.0)\n", - "Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.1.0)\n", - "Requirement already satisfied: isodate>=0.6.1 in /opt/conda/lib/python3.9/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.1)\n", - "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.1)\n", - "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.2)\n", - "Requirement already 
satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.2.1)\n", - "Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)\n", - "Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.0)\n", - "Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.1)\n", - "Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", - "Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.7)\n", - "Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.8)\n", - "Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)\n", - "Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)\n", - "Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.4)\n", - "Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from 
distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)\n", - "Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.9/site-packages (from GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.11)\n", - "Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.19.1)\n", - "Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.6)\n", - "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.43)\n", - "Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.2)\n", - "Requirement already satisfied: stack-data in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.3)\n", - "Requirement already satisfied: traitlets>=5 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.14.1)\n", - "Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from jinja2>=3.1.3,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.5)\n", - "Requirement already satisfied: absl-py<2,>=0.9 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.0)\n", - "Requirement already 
satisfied: kubernetes<26,>=8.0.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (25.3.0)\n", - "Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.12.11)\n", - "Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.1)\n", - "Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.5)\n", - "Requirement already satisfied: jsonschema<5,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.21.1)\n", - "Requirement already satisfied: strip-hints<1,>=0.1.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.10)\n", - "Requirement already satisfied: docstring-parser<1,>=0.7.3 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.15)\n", - "Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.16 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.16)\n", - "Requirement already satisfied: fire<1,>=0.3.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.0)\n", - "Requirement already satisfied: uritemplate<4,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)\n", - "Requirement already satisfied: typer<1.0,>=0.3.2 in /opt/conda/lib/python3.9/site-packages (from 
kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", - "Requirement already satisfied: entrypoints<1 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4)\n", - "Requirement already satisfied: pytz<2024 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.4)\n", - "Requirement already satisfied: sqlparse<1,>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4.4)\n", - "Requirement already satisfied: alembic!=1.10.0,<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13.1)\n", - "Requirement already satisfied: docker<8,>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.0)\n", - "Requirement already satisfied: Flask<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: querystring-parser<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.4)\n", - "Requirement already satisfied: markdown<4,>=3.3 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.5.2)\n", - "Requirement already satisfied: matplotlib<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.8.3)\n", - "Requirement already satisfied: gunicorn<22 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)\n", - "Requirement already satisfied: requests-oauthlib>=0.5.0 in 
/opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.2.2)\n", - "Requirement already satisfied: nbconvert>=6.4.5 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.16.1)\n", - "Requirement already satisfied: notebook<7.0.0,>=6.4 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.5.6)\n", - "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.9/site-packages (from pandas<2.2,>=1.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.1)\n", - "Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly<5.12.0,~=5.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.2.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.9/site-packages (from requests~=2.31->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.4)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.9/site-packages (from sqlalchemy~=1.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.3)\n", - "Requirement already satisfied: nuclio-sdk>=0.5.3 in /opt/conda/lib/python3.9/site-packages (from storey~=1.6.18->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.9)\n", - "Collecting networkx (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for networkx from https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl.metadata\n", - " Downloading 
networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)\n", - "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cuda-nvrtc-cu12==12.1.105 from https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cuda-runtime-cu12==12.1.105 from https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cuda-cupti-cu12==12.1.105 from https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cudnn-cu12==8.9.2.26 from https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting 
nvidia-cublas-cu12==12.1.3.1 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cublas-cu12==12.1.3.1 from https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cufft-cu12==11.0.2.54 from https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-curand-cu12==10.3.2.106 from https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cusolver-cu12==11.4.5.107 from https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 
4))\n", - " Obtaining dependency information for nvidia-cusparse-cu12==12.1.0.106 from https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-nccl-cu12==2.19.3 from https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)\n", - "Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-nvtx-cu12==12.1.105 from https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n", - "Collecting triton==2.2.0 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for triton==2.2.0 from https://files.pythonhosted.org/packages/6a/5c/01d9f062f719581cf6e60053e1a005d666ec67dcb59630fffaa3a3e5c9d8/triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)\n", - "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-nvjitlink-cu12 from 
https://files.pythonhosted.org/packages/58/d1/d1c80553f9d5d07b6072bc132607d75a0ef3600e28e1890e11c0f55d7346/nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata\n", - " Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", - "INFO: pip is looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.\n", - "Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/a4/73/f620d76193954e16db3d5c53a07d956d7b9c800e570758d3bff91906d4a4/transformers-4.39.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 115.9 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 126.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3e/6b/1b589f7b69aaea8193cf5bc91cf97410284aecd97b6312cdb08baedbdffe/transformers-4.38.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 138.2 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/91/89/5416dc364c7ef0711c564fd61a69b03d1e40eeb5c506c38e53ba8a969e79/transformers-4.38.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 186.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/85/f6/c5065913119c41ecad148c34e3a861f719e16b89a522287213698da911fc/transformers-4.37.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 236.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ad/67/b4d6a51dcaf988cb45b31e26c6e33fb169fe34ba5fb168b086309bd7c028/transformers-4.37.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 156.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3c/45/52133ce6bce49a099cc865599803bf1fad93de887276f728e56848d77a70/transformers-4.37.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 102.0 MB/s eta 0:00:00\n", - "INFO: pip is still looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. 
This could take a while.\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 108.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/fc/04/0aad491cd98b09236c54ab849863ee85421eeda5138bbf9d33ecc594652b/transformers-4.36.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 140.6 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0f/12/d8e27a190ca67811f81deea3183b528d9169f10b74d827e0b9211520ecfa/transformers-4.36.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 267.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 130.2 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/92/ba/cfff7e01f7070d9fca3964bf42b2257b86964c3e6763b8d5435436cc1d77/transformers-4.35.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.35.1-py3-none-any.whl.metadata (123 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 183.6 
MB/s eta 0:00:00\n", - "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 177.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 270.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 133.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 163.1 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.9 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/13/30/54b59e73400df3de506ad8630284e9fd63f4b94f735423d55fc342181037/transformers-4.33.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.2 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 185.9 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.32.1-py3-none-any.whl.metadata (118 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 144.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 150.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.9/116.9 kB 156.7 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/5b/0b/e45d26ccd28568013523e04f325432ea88a442b4e3020b757cf4361f0120/transformers-4.30.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.7 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b8/df/b01b5e67cde3883757c9212455cbb9169385dcab5858b7172199126b756d/transformers-4.30.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.30.1-py3-none-any.whl.metadata (113 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e2/72/1af3d38e98fdcceb3876de4567ac395a66c26976e259fe2d46266e052d61/transformers-4.30.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 266.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/17/aa/a89864288afe45abe1ab79f002140a20348140e86836d96096d8f8a3bac0/transformers-4.29.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.29.2-py3-none-any.whl.metadata (112 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 272.7 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.29.1-py3-none-any.whl.metadata (112 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 262.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/45/e4/4914b11df70954d95a7c36b74bf9010c8594fcec960471479449b0deb4f7/transformers-4.29.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.29.0-py3-none-any.whl.metadata (111 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.9/111.9 kB 269.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 245.6 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/8b/13/1ce598763b3669d43f192a7911bf2bf730a328012ab8801b93187a4f70d0/transformers-4.28.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 256.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/87/f0/2a152ed10ab8601431e87a606d397f7473c5fa4f8162f4ec5bda6ddb2df4/transformers-4.27.4-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 254.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/52/ac/9dc5a17ba60bc354d99250d9d1629f99d76f6729cee438fa91c8cc74bc5d/transformers-4.27.3-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.3-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 251.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/73/f0/4a795505387a3e7cd7f0c2a2a87f876658f9a07947a38fb67bffceff9246/transformers-4.27.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 246.1 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/6d/9b/2f536f9e73390209e0b27b74691355dac494b7ec8154f3012fdc6debbae7/transformers-4.27.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.1-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 114.0 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/4d/3e/1378ed266cf991f5ab5fcb29e953d97d793c7f9242ea5dc52f856415ea3a/transformers-4.27.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.0-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 247.2 MB/s eta 0:00:00\n", - "Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for sentencepiece!=0.1.92,>=0.1.91 from https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata 
(7.7 kB)\n", - "Collecting protobuf>=3.20.2 (from onnx~=1.14.1->-r /empty/requirements.txt (line 2))\n", - " Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/38/b1/d9b615dceb67ac38e13cbd7680c27182b40154996022cbb244ba1ac7d30f/protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata\n", - " Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)\n", - "Requirement already satisfied: future>=0.18.2 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", - "Requirement already satisfied: ujson>=3 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.0)\n", - "Requirement already satisfied: googleapis-common-protos>=1.5.3 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)\n", - "Requirement already satisfied: grpcio-tools!=1.34.0,<1.49,>=1.30 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)\n", - "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata\n", - " Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n", - "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. 
This could take a while.\n", - "Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/c6/c9/820b5ab056f4ada76fbe05bd481a948f287957d6cbfd59e2dd2618b408c1/multiprocess-0.70.15-py39-none-any.whl.metadata\n", - " Downloading multiprocess-0.70.15-py39-none-any.whl.metadata (7.2 kB)\n", - " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/6a/f4/fbeb03ef7abdda54db4a6a75c971b88ab73d724ff09e3275cc1e99f1c946/multiprocess-0.70.14-py39-none-any.whl.metadata\n", - " Downloading multiprocess-0.70.14-py39-none-any.whl.metadata (6.6 kB)\n", - "Collecting mpmath>=0.19 (from sympy->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for mpmath>=0.19 from https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl.metadata\n", - " Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n", - "Requirement already satisfied: Mako in /opt/conda/lib/python3.9/site-packages (from alembic!=1.10.0,<2->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.2)\n", - "Requirement already satisfied: cffi in /opt/conda/lib/python3.9/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", - "Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire<1,>=0.3.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)\n", - "Requirement already satisfied: Werkzeug>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)\n", - "Requirement already satisfied: itsdangerous>=2.1.2 in /opt/conda/lib/python3.9/site-packages (from 
Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.2)\n", - "Requirement already satisfied: blinker>=1.6.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.0.1)\n", - "Requirement already satisfied: httplib2<1dev,>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.22.0)\n", - "Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.3.3)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9)\n", - "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery-storage<3.0.0dev,>=2.6.0->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.23.0)\n", - "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /opt/conda/lib/python3.9/site-packages (from 
google-cloud-storage->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.0)\n", - "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata>=4.13.0->dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.17.0)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from jedi>=0.16->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.3)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.12.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.33.0)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)\n", - "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.9/site-packages (from kubernetes<26,>=8.0.0->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.49.0)\n", 
- "Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.5)\n", - "Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (10.2.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.1)\n", - "Requirement already satisfied: importlib-resources>=3.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.2)\n", - "Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from msal<2.0.0,>=1.24.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.0)\n", - "Requirement already satisfied: portalocker<3,>=1.0 in /opt/conda/lib/python3.9/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)\n", - "Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.12.3)\n", - "Requirement already satisfied: bleach!=5.0.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.0)\n", - "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.1)\n", - "Requirement already satisfied: jupyter-core>=4.7 in /opt/conda/lib/python3.9/site-packages (from 
nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.7.1)\n", - "Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)\n", - "Requirement already satisfied: mistune<4,>=2.0.3 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", - "Requirement already satisfied: nbformat>=5.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.2)\n", - "Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)\n", - "Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.1)\n", - "Requirement already satisfied: pyzmq<25,>=17 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.1)\n", - "Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1.0)\n", - "Requirement already satisfied: jupyter-client<8,>=5.3.4 in /opt/conda/lib/python3.9/site-packages (from 
notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.4.9)\n", - "Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)\n", - "Requirement already satisfied: ipykernel in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.29.3)\n", - "Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.2)\n", - "Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)\n", - "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.0)\n", - "Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.13)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from 
requests-oauthlib>=0.5.0->msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.2.2)\n", - "Requirement already satisfied: wheel in /opt/conda/lib/python3.9/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.41.2)\n", - "Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.1)\n", - "Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)\n", - "Requirement already satisfied: pure-eval in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.2)\n", - "Requirement already satisfied: webencodings in /opt/conda/lib/python3.9/site-packages (from bleach!=5.0.0->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", - "Requirement already satisfied: pycparser in /opt/conda/lib/python3.9/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.21)\n", - "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /opt/conda/lib/python3.9/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)\n", - "Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.9/site-packages (from jupyter-core>=4.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.10.0)\n", - "Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.9/site-packages (from 
nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.12.5)\n", - "Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.4)\n", - "Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.9/site-packages (from nbformat>=5.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.19.1)\n", - "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", - "Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.9/site-packages (from argon2-cffi->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)\n", - "Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.9/site-packages (from beautifulsoup4->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5)\n", - "Requirement already satisfied: comm>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.1)\n", - "Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.1)\n", - "Requirement already satisfied: jupyter-events>=0.9.0 in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 
1)) (0.9.0)\n", - "Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.2)\n", - "Requirement already satisfied: overrides in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.7.0)\n", - "Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.7)\n", - "Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.4)\n", - "Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.1)\n", - "Requirement already satisfied: fqdn in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)\n", - "Requirement already satisfied: isoduration in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (20.11.0)\n", - "Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1)\n", - "Requirement 
already satisfied: uri-template in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)\n", - "Requirement already satisfied: webcolors>=1.11 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13)\n", - "Requirement already satisfied: arrow>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)\n", - "Requirement already satisfied: types-python-dateutil>=2.8.10 in /opt/conda/lib/python3.9/site-packages (from arrow>=0.15.0->isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.19.20240106)\n", - "Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 274.2 MB/s eta 0:00:00\n", - "Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 277.9 MB/s eta 0:00:00\n", - "Downloading optimum-1.6.4-py3-none-any.whl (227 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.8/227.8 kB 291.3 MB/s eta 0:00:00\n", - "Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 242.4 MB/s eta 0:00:00\n", - "Downloading datasets-2.10.1-py3-none-any.whl (469 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 469.0/469.0 kB 185.9 MB/s eta 0:00:00\n", - "Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 275.9 MB/s eta 0:00:00\n", - "Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 282.3 MB/s eta 0:00:00\n", - 
"Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 346.4/346.4 kB 311.7 MB/s eta 0:00:00\n", - "Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 269.6 MB/s eta 0:00:00\n", - "Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 773.4/773.4 kB 311.9 MB/s eta 0:00:00\n", - "Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 264.1 MB/s eta 0:00:00\n", - "Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.5/755.5 MB 204.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 40.3 MB/s eta 0:00:00\n", - "Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 43.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 46.9 MB/s eta 0:00:00\n", - "Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 51.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 58.2 MB/s eta 0:00:00\n", - "Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 69.0 MB/s eta 0:00:00\n", - 
"Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 36.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 52.8 MB/s eta 0:00:00\n", - "Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 45.9 MB/s eta 0:00:00\n", - "Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.0/166.0 MB 19.6 MB/s eta 0:00:00\n", - "Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 27.7 MB/s eta 0:00:00\n", - "Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.9/167.9 MB 41.3 MB/s eta 0:00:00\n", - "Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 42.8 MB/s eta 0:00:00\n", - "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 192.0 MB/s eta 0:00:00\n", - "Downloading filelock-3.13.1-py3-none-any.whl (11 kB)\n", - "Downloading flatbuffers-24.3.7-py2.py3-none-any.whl (26 kB)\n", - "Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 kB 100.7 MB/s eta 0:00:00\n", - "Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 41.4 MB/s eta 0:00:00\n", - "Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 253.7 MB/s eta 0:00:00\n", - "Downloading 
mpmath-1.3.0-py3-none-any.whl (536 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 45.4 MB/s eta 0:00:00\n", - "Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 46.1 MB/s eta 0:00:00\n", - "Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 43.7 MB/s eta 0:00:00\n", - "Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 43.8 MB/s eta 0:00:00\n", - "Installing collected packages: tokenizers, sentencepiece, mpmath, flatbuffers, sympy, regex, protobuf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, humanfriendly, filelock, dill, triton, responses, onnx, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, coloredlogs, transformers, scikit-learn, onnxruntime, nvidia-cusolver-cu12, torch, datasets, optimum\n", - " Attempting uninstall: protobuf\n", - " Found existing installation: protobuf 3.20.3\n", - " Uninstalling protobuf-3.20.3:\n", - " Successfully uninstalled protobuf-3.20.3\n", - " Attempting uninstall: numpy\n", - " Found existing installation: numpy 1.26.4\n", - " Uninstalling numpy-1.26.4:\n", - " Successfully uninstalled numpy-1.26.4\n", - " Attempting uninstall: scikit-learn\n", - " Found existing installation: scikit-learn 1.4.1.post1\n", - " Uninstalling scikit-learn-1.4.1.post1:\n", - " Successfully uninstalled scikit-learn-1.4.1.post1\n", - "Successfully installed coloredlogs-15.0.1 datasets-2.10.1 dill-0.3.6 filelock-3.13.1 flatbuffers-24.3.7 huggingface-hub-0.21.4 humanfriendly-10.0 mpmath-1.3.0 multiprocess-0.70.14 networkx-3.2.1 numpy-1.23.5 nvidia-cublas-cu12-12.1.3.1 
nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 onnx-1.14.1 onnxruntime-1.16.3 optimum-1.6.4 protobuf-3.20.2 regex-2023.12.25 responses-0.18.0 scikit-learn-1.0.2 sentencepiece-0.2.0 sympy-1.12 tokenizers-0.13.3 torch-2.2.1 transformers-4.26.1 triton-2.2.0\n", - "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", - "\u001b[36mINFO\u001b[0m[0238] Taking snapshot of full filesystem... \n", - "\u001b[36mINFO\u001b[0m[0463] Pushing image to docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest \n", - "\u001b[36mINFO\u001b[0m[0493] Pushed docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer@sha256:691d0bb3c23487b4b5d2f84ab323c24735626ee81681475f53a4158b72d4cfee \n" - ] - }, - { - "data": { - "text/plain": [ - "BuildStatus(ready=True, outputs={'image': '.mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest'})" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "project.build_function(\"hugging-face-classifier-trainer\",with_mlrun=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:22:42,252 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '53252ce7aacb4b1aacf86bf3b862daa2', 'db': 
'http://mlrun-api:8080'}\n", - "> 2024-03-24 17:22:42,536 [info] Job is running in the background, pod: hugging-face-classifier-trainer-train-dqqfr\n", - "> 2024-03-24 17:24:43,288 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2\n", - "> 2024-03-24 17:24:43,847 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub\n", - "Downloading metadata: 100%|██████████| 1.03k/1.03k [00:00<00:00, 6.77MB/s]\n", - "Downloading and preparing dataset None/None (download: 265.13 KiB, generated: 1.50 MiB, post-processed: Unknown size, total: 1.76 MiB) to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n", - "Downloading data files: 0%| | 0/3 [00:00 2024-03-24 17:24:47,076 [info] training 'huggingface-model'\n", - "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "***** Running training *****\n", - " Num examples = 100\n", - " Num Epochs = 3\n", - " Instantaneous batch size per device = 16\n", - " Total train batch size (w. parallel, distributed & accumulation) = 16\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 21\n", - " Number of trainable parameters = 66955010\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - " 0%| | 0/21 [00:00 2024-03-24 17:26:00,230 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia', 'logs_cmd': 'mlrun logs 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia'}\n", - "> 2024-03-24 17:26:00,231 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/hugging-face-trainer-avia/jobs/monitor/53252ce7aacb4b1aacf86bf3b862daa2/overview'}\n", - "> 2024-03-24 17:26:00,231 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:24:39completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=job
owner=avia
mlrun/client_version=1.6.1
mlrun/client_python_version=3.9.16
host=hugging-face-classifier-trainer-train-dqqfr
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.5215
learning_rate=0.0
eval_loss=0.4750453531742096
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=1.0524
eval_samples_per_second=22.806
eval_steps_per_second=1.9
train_runtime=55.1543
train_samples_per_second=5.439
train_steps_per_second=0.381
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:26:09,792 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" - ] - } - ], - "source": [ - "train_run = hugging_face_classifier_trainer.run(params={\n", - " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", - " \"drop_columns\": [\n", - " \"airline_sentiment_confidence\",\n", - " \"negativereason_confidence\",\n", - " ],\n", - " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", - " \"pretrained_model\": \"distilbert-base-uncased\",\n", - " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", - " \"label_name\": \"airline_sentiment\",\n", - " \"num_of_train_samples\": 100,\n", - " \"metrics\": [\"accuracy\", \"f1\"],\n", - " \"random_state\": 42,\n", - " **additional_parameters\n", - " },\n", - " handler=\"train\", \n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "[Back to the top](#top)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mlrun-base", - "language": "python", - "name": "conda-env-mlrun-base-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff 
--git a/functions/development/hugging_face_classifier_trainer/0.2.0/src/hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/0.2.0/src/hugging_face_classifier_trainer.py deleted file mode 100755 index 29d07039..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/src/hugging_face_classifier_trainer.py +++ /dev/null @@ -1,832 +0,0 @@ -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import mlrun -import mlrun.datastore -import mlrun.utils -import numpy as np -import pandas as pd -import transformers -from datasets import Dataset, load_dataset, load_metric -from mlrun import MLClientCtx -from mlrun import feature_store as fs -from mlrun.artifacts import Artifact, PlotlyArtifact -from mlrun.datastore import DataItem -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import create_class -from plotly import graph_objects as go -from sklearn.model_selection import train_test_split -from transformers import ( - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - PreTrainedModel, - PreTrainedTokenizer, - Trainer, - TrainerCallback, - TrainerControl, - TrainerState, - TrainingArguments, -) - - -# ----------------------from MLRUN-------------------------------- -class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "optimize", - ] - - @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper - - def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data - - -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow 
keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: 
PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - ) - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return - - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - 
artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. 
If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError - - -# ---------------------- from auto_trainer-------------------------------- -class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_" - - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. 
- :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. - """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {: , ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. 
- :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - 
test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." - ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() 
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config) diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/src/item.yaml b/functions/development/hugging_face_classifier_trainer/0.2.0/src/item.yaml deleted file mode 100755 index 3c087765..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/src/item.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: v1 -categories: -- 
machine-learning -- model-training -description: Automatic train and optimize functions for HuggingFace framework -doc: '' -example: hugging_face_classifier_trainer.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: davids -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.6.1 -name: hugging_face_classifier_trainer -platformVersion: 3.5.5 -spec: - filename: hugging_face_classifier_trainer.py - handler: train - image: mlrun/mlrun - kind: job - requirements: - - onnx~=1.14.1 - - onnxruntime~=1.16.1 - - optimum~=1.6.4 - - transformers~=4.26.1 - - datasets~=2.10.1 - - scikit-learn~=1.0.2 -url: '' -version: 0.2.0 diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/src/requirements.txt b/functions/development/hugging_face_classifier_trainer/0.2.0/src/requirements.txt deleted file mode 100644 index 9d0db7b4..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -onnx~=1.14.1 -onnxruntime~=1.16.1 -optimum~=1.6.4 -transformers~=4.26.1 -datasets~=2.10.1 -scikit-learn~=1.0.2 \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/src/test_hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/0.2.0/src/test_hugging_face_classifier_trainer.py deleted file mode 100644 index a5e0fee9..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/src/test_hugging_face_classifier_trainer.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os - -import mlrun -import pytest -from mlrun import import_function - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - -ADDITIONAL_PARAM_FOR_TRAIN = { - "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples", - "TRAIN_learning_rate": 2e-5, - "TRAIN_per_device_train_batch_size": 16, - "TRAIN_per_device_eval_batch_size": 16, - "TRAIN_num_train_epochs": 2, - "TRAIN_weight_decay": 0.01, - "TRAIN_push_to_hub": False, - "TRAIN_evaluation_strategy": "epoch", - "TRAIN_eval_steps": 1, - "TRAIN_logging_steps": 1, - "CLASS_num_labels": 2, -} - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. 
- """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(env_file=None): - if env_file: - mlrun.set_env_from_file(env_file) - mlrun.get_or_create_project( - "hugging-face-classifier-trainer-test", context="./", user_project=True - ) - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - "model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_and_optimize_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - optimize_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - 
"model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - - optimize_run = fn.run( - params={"model_path": train_run.outputs["model"]}, - handler="optimize", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - assert optimize_run and all( - key in optimize_run.outputs for key in ["model"] - ), "outputs should include more data" diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/static/documentation.html b/functions/development/hugging_face_classifier_trainer/0.2.0/static/documentation.html deleted file mode 100644 index 1652c838..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/static/documentation.html +++ /dev/null @@ -1,394 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

hugging_face_classifier_trainer package

- -
- -
-
-
-
-
-

hugging_face_classifier_trainer package#

-
-

Submodules#

-
-
-

hugging_face_classifier_trainer.hugging_face_classifier_trainer module#

-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFORTOptimizerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common., abc.ABC

-

Interface for adding MLRun features for the Hugging Face ONNX Runtime (ORT) optimizer API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras -MLRun’s features. -:param obj: The object to enrich his interface. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-enable_auto_logging(context: mlrun.execution.MLClientCtx, model_name: str = 'model', tag: str = '', labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None)[source]#
-
-
-
-classmethod mlrun_optimize()[source]#
-

MLRun’s tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFTrainerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common., abc.ABC

-

Interface for adding MLRun features for the Hugging Face transformers Trainer API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj: transformers.Trainer, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras -MLRuns features. -:param obj: The object to enrich his interface. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-classmethod mlrun_train()[source]#
-

MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.KWArgsPrefixes[source]#
-

Bases: object

-
-
-FIT = 'FIT_'#
-
-
-
-MODEL_CLASS = 'CLASS_'#
-
-
-
-PREDICT = 'PREDICT_'#
-
-
-
-TRAIN = 'TRAIN_'#
-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
-

Bases: transformers.

-

Callback for collecting logs during training / evaluation of the Trainer API.

-
-
-on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
-
-
-
-on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.apply_mlrun(huggingface_object, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
-

Wrap the given model with MLRun’s interface providing it with mlrun’s additional features. -:param huggingface_object: The model to wrap. Can be loaded from the model path given as well. -:param model_name: The model name to use for storing the model artifact. Default: “model”. -:param tag: The model’s tag to log with. -:param context: MLRun context to work with. If no context is given it will be retrieved via

-
-

‘mlrun.get_or_create_ctx(None)’

-
-
-
Parameters
-

auto_log – Whether to enable MLRun’s auto logging. Default: True.

-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.optimize(model_path: str, model_name: str = 'optimized_model', target_dir: str = './optimized', optimization_level: int = 1)[source]#
-

Optimizing the transformer model using ONNX optimization.

-
-
Parameters
-
    -
  • model_path – The path of the model to optimize.

  • -
  • model_name – Name of the optimized model.

  • -
  • target_dir – The directory to save the ONNX model.

  • -
  • optimization_level – Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)

  • -
-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.train(context: mlrun.execution.MLClientCtx, hf_dataset: Optional[str] = None, dataset: Optional[mlrun.datastore.base.DataItem] = None, test_set: Optional[mlrun.datastore.base.DataItem] = None, drop_columns: Optional[List[str]] = None, pretrained_tokenizer: Optional[str] = None, pretrained_model: Optional[str] = None, model_class: Optional[str] = None, model_name: str = 'huggingface-model', label_name: str = 'labels', text_col: str = 'text', num_of_train_samples: Optional[int] = None, train_test_split_size: Optional[float] = None, metrics: Optional[List[str]] = None, random_state: Optional[int] = None)[source]#
-

Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. -The dataset can be either the name of a dataset hosted on the HuggingFace hub, -or a URI or a FeatureVector

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • hf_dataset – The name of the dataset to get from the HuggingFace hub

  • -
  • dataset – The dataset to train the model on. Can be either a URI or a FeatureVector

  • -
  • test_set – The test set to train the model with.

  • -
  • drop_columns – The columns to drop from the dataset.

  • -
  • pretrained_tokenizer – The name of the pretrained tokenizer from the HuggingFace hub.

  • -
  • pretrained_model – The name of the pretrained model from the HuggingFace hub.

  • -
  • model_name – The model’s name to use for storing the model artifact, defaults to ‘huggingface-model’

  • -
  • model_class – The class of the model, e.g. transformers.AutoModelForSequenceClassification

  • -
  • label_name – The name of the target label column in the dataset.

  • -
  • text_col – The input text column in the dataset.

  • -
  • num_of_train_samples – Max number of training samples, for debugging.

  • -
  • train_test_split_size – Should be between 0.0 and 1.0 and represent the proportion of the dataset to include -in the test split.

  • -
  • metrics – List of metrics for evaluating the model, such as f1, accuracy, etc.

  • -
  • random_state – Random state for train_test_split

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/static/example.html b/functions/development/hugging_face_classifier_trainer/0.2.0/static/example.html deleted file mode 100644 index 5fdd60e5..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/static/example.html +++ /dev/null @@ -1,2406 +0,0 @@ - - - - - - - -MLRun Hugging Face Classifier Trainer Tutorial - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-

-
-

MLRun Hugging Face Classifier Trainer Tutorial#

-

This notebook shows how to use the handlers of the Hugging Face classifier trainer. -The available handlers are:

-
    -
  • train

  • -
  • optimize

  • -
-

All you need is an HF model type and an HF dataset name.

-
-
-
%pip install -r requirements.txt
-
-
-
-
-
Requirement already satisfied: onnx~=1.14.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (1.14.1)
-Requirement already satisfied: onnxruntime==1.16.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.16.1)
-Requirement already satisfied: optimum~=1.6.4 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.6.4)
-Requirement already satisfied: transformers~=4.26.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (4.26.1)
-Requirement already satisfied: datasets~=2.10.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (2.10.1)
-Requirement already satisfied: scikit-learn~=1.0.2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.0.2)
-Requirement already satisfied: coloredlogs in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (15.0.1)
-Requirement already satisfied: flatbuffers in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)
-Requirement already satisfied: numpy>=1.21.6 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.23.5)
-Requirement already satisfied: packaging in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (21.3)
-Requirement already satisfied: protobuf in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.20.2)
-Requirement already satisfied: sympy in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)
-Requirement already satisfied: typing-extensions>=3.6.2.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnx~=1.14.1->-r requirements.txt (line 1)) (4.7.1)
-Requirement already satisfied: torch>=1.9 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.2)
-Requirement already satisfied: huggingface-hub>=0.8.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (0.20.1)
-Requirement already satisfied: filelock in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (3.13.1)
-Requirement already satisfied: pyyaml>=5.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (5.4.1)
-Requirement already satisfied: regex!=2019.12.17 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2023.12.25)
-Requirement already satisfied: requests in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2.31.0)
-Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (0.13.3)
-Requirement already satisfied: tqdm>=4.27 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (4.65.0)
-Requirement already satisfied: pyarrow>=6.0.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (11.0.0)
-Requirement already satisfied: dill<0.3.7,>=0.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.3.6)
-Requirement already satisfied: pandas in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.4)
-Requirement already satisfied: xxhash in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.3.0)
-Requirement already satisfied: multiprocess in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.70.14)
-Requirement already satisfied: fsspec>=2021.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from fsspec[http]>=2021.11.1->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.9.2)
-Requirement already satisfied: aiohttp in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.9.1)
-Requirement already satisfied: responses<0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.18.0)
-Requirement already satisfied: scipy>=1.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.11.4)
-Requirement already satisfied: joblib>=0.11 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.3.2)
-Requirement already satisfied: threadpoolctl>=2.0.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (3.2.0)
-Requirement already satisfied: attrs>=17.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (19.1.0)
-Requirement already satisfied: multidict<7.0,>=4.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (6.0.4)
-Requirement already satisfied: yarl<2.0,>=1.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.9.2)
-Requirement already satisfied: frozenlist>=1.1.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.0)
-Requirement already satisfied: aiosignal>=1.1.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.3.1)
-Requirement already satisfied: async-timeout<5.0,>=4.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (4.0.3)
-Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from packaging->onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.1.1)
-Requirement already satisfied: charset-normalizer<4,>=2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2.1.1)
-Requirement already satisfied: idna<4,>=2.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (3.4)
-Requirement already satisfied: urllib3<3,>=1.21.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (1.26.16)
-Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2023.7.22)
-Requirement already satisfied: networkx in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.2.1)
-Requirement already satisfied: jinja2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.1.3)
-Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (8.9.2.26)
-Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.3.1)
-Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.0.2.54)
-Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (10.3.2.106)
-Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.4.5.107)
-Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.0.106)
-Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.18.1)
-Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: triton==2.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.0)
-Requirement already satisfied: nvidia-nvjitlink-cu12 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.3.101)
-Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers[sentencepiece]>=4.26.0->optimum~=1.6.4->-r requirements.txt (line 3)) (0.2.0)
-Requirement already satisfied: humanfriendly>=9.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from coloredlogs->onnxruntime==1.16.1->-r requirements.txt (line 2)) (9.2)
-Requirement already satisfied: python-dateutil>=2.8.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2.8.2)
-Requirement already satisfied: pytz>=2020.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.3.post1)
-Requirement already satisfied: mpmath>=0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from sympy->onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.3.0)
-Requirement already satisfied: six>=1.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (1.16.0)
-Requirement already satisfied: MarkupSafe>=2.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from jinja2->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.3)
-Note: you may need to restart the kernel to use updated packages.
-
-
-
-
-
-
-
import mlrun
-
-
-
-
-
-
-
project = mlrun.get_or_create_project('hugging-face-trainer', context="./", user_project=True)
-
-
-
-
-
> 2024-03-24 17:10:17,091 [info] Project loaded successfully: {'project_name': 'hugging-face-trainer'}
-
-
-
-
-
-

Importing the hugging_face_classifier_trainer function from the Marketplace#

-
-
-
hugging_face_classifier_trainer = mlrun.import_function("hub://hugging_face_classifier_trainer")
-
-
-
-
-
-
-

Training a model#

-

Choosing the train handler

-
-

Define task parameters#

-
    -
  • Class parameters should contain the prefix CLASS_

  • -
  • Train parameters should contain the prefix TRAIN_

  • -
-
-
-
model_class = "transformers.AutoModelForSequenceClassification"
-additional_parameters = {
-    "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples",
-    "TRAIN_learning_rate": 2e-5,
-    "TRAIN_per_device_train_batch_size": 16,
-    "TRAIN_per_device_eval_batch_size": 16,
-    "TRAIN_num_train_epochs": 3,
-    "TRAIN_weight_decay": 0.01,
-    "TRAIN_push_to_hub": False,
-    "TRAIN_evaluation_strategy": "epoch",
-    "TRAIN_eval_steps": 1,
-    "TRAIN_logging_steps": 1,
-    "CLASS_num_labels": 2
-}
-
-
-
-
-
-
-

Running the Training job with the “train” handler#

-
-
-
train_run = hugging_face_classifier_trainer.run(params={
-                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
-                                                        "drop_columns": [
-                                                            "airline_sentiment_confidence",
-                                                            "negativereason_confidence",
-                                                        ],
-                                                        "pretrained_tokenizer": "distilbert-base-uncased",
-                                                        "pretrained_model": "distilbert-base-uncased",
-                                                        "model_class": "transformers.AutoModelForSequenceClassification",
-                                                        "label_name": "airline_sentiment",
-                                                        "num_of_train_samples": 100,
-                                                        "metrics": ["accuracy", "f1"],
-                                                        "random_state": 42,
-                                                        **additional_parameters
-                                                    },
-                                                    handler="train",
-                                                    local=True,
-                                                )
-
-
-
-
-
> 2024-03-24 17:10:21,025 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '514d8d5530c842238b1cc81983cd943e', 'db': 'http://mlrun-api:8080'}
-> 2024-03-24 17:11:03,727 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2
-> 2024-03-24 17:11:03,882 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub
-
-
-
Found cached dataset parquet (/igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
-
-
Loading cached shuffled indices for dataset at /igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ec18d1773cfb9bb5.arrow
-Loading cached shuffled indices for dataset at /igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e0c54c494a578ee6.arrow
-
-
-
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
-- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
-- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
-Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
-You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
-
-
-
> 2024-03-24 17:11:08,938 [info] training 'huggingface-model'
-
-
-
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
-***** Running training *****
-  Num examples = 100
-  Num Epochs = 3
-  Instantaneous batch size per device = 16
-  Total train batch size (w. parallel, distributed & accumulation) = 16
-  Gradient Accumulation steps = 1
-  Total optimization steps = 21
-  Number of trainable parameters = 66955010
-You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
-
-
-
-
- - [21/21 00:15, Epoch 3/3] -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
EpochTraining LossValidation LossAccuracyF1
10.7389000.5153110.7916670.000000
20.5259000.4815630.7916670.000000
30.4908000.4716750.7916670.000000

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-/tmp/tmp0c1aawrq.py:561: FutureWarning:
-
-load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
-
-The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-
-
-Training completed. Do not forget to share your model on huggingface.co/models =)
-
-
-tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json
-Special tokens file saved in /tmp/tokenizer/special_tokens_map.json
-Configuration saved in /tmp/model/config.json
-Model weights saved in /tmp/model/pytorch_model.bin
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:10:21completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.4908
learning_rate=0.0
eval_loss=0.47167453169822693
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=0.5186
eval_samples_per_second=46.276
eval_steps_per_second=3.856
train_runtime=17.6054
train_samples_per_second=17.04
train_steps_per_second=1.193
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:12:01,880 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
-
-
-
-
-
-
-

The result of the train run#

-
-
-
train_run.outputs
-
-
-
-
-
{'loss': 0.4908,
- 'learning_rate': 0.0,
- 'eval_loss': 0.47167453169822693,
- 'eval_accuracy': 0.7916666666666666,
- 'eval_f1': 0.0,
- 'eval_runtime': 0.5186,
- 'eval_samples_per_second': 46.276,
- 'eval_steps_per_second': 3.856,
- 'train_runtime': 17.6054,
- 'train_samples_per_second': 17.04,
- 'train_steps_per_second': 1.193,
- 'total_flos': 3327208489680.0,
- 'loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',
- 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',
- 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',
- 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',
- 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',
- 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',
- 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',
- 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',
- 'tokenizer': 'store://artifacts/hugging-face-trainer-avia/hugging-face-classifier-trainer-train_tokenizer@514d8d5530c842238b1cc81983cd943e',
- 'model': 'store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e'}
-
-
-
-
-
-
-
train_run.artifact('loss_plot').show()
-
-
-
-
-
- - -
-
- -
-
-
-
-

Getting the model for evaluating and predicting#

-
-
-
model_path = train_run.outputs['model']
-
-
-
-
-
-
-
-

Optimize the model#

-

Choosing the optimize handler

-

The result of using this handler is an ONNX-optimized model.

-
-
-
optimize_run = hugging_face_classifier_trainer.run(params={
-                                                        "model_path": str(model_path)
-                                                    },
-                                                    handler="optimize",
-                                                    local=True,
-                                                )
-
-
-
-
-
> 2024-03-24 17:12:02,020 [info] Storing function: {'name': 'hugging-face-classifier-trainer-optimize', 'uid': 'fbee1ead18444824a4b5c0308a677bf4', 'db': 'http://mlrun-api:8080'}
-
-
-
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/optimum/onnxruntime/configuration.py:726: FutureWarning:
-
-disable_embed_layer_norm will be deprecated soon, use disable_embed_layer_norm_fusion instead, disable_embed_layer_norm_fusion is set to True.
-
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/config.json",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading weights file /tmp/pytorch_model.bin
-All model checkpoint weights were used when initializing DistilBertForSequenceClassification.
-
-All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at /tmp.
-If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
-/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py:218: TracerWarning:
-
-torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
-
-Configuration saved in /tmp/tmp79wjp8m8/config.json
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Configuration saved in optimized/config.json
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Failed to remove node input: "/distilbert/transformer/layer.0/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.0/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.0/attention/Div_output_0"
-name: "/distilbert/transformer/layer.0/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.1/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.1/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.1/attention/Div_output_0"
-name: "/distilbert/transformer/layer.1/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.2/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.2/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.2/attention/Div_output_0"
-name: "/distilbert/transformer/layer.2/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.3/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.3/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.3/attention/Div_output_0"
-name: "/distilbert/transformer/layer.3/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.4/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.4/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.4/attention/Div_output_0"
-name: "/distilbert/transformer/layer.4/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.5/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.5/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.5/attention/Div_output_0"
-name: "/distilbert/transformer/layer.5/attention/Div"
-op_type: "Div"
-
-Configuration saved in optimized/config.json
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:12:02completedhugging-face-classifier-trainer-optimize
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
model_path=store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e
model
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:12:22,721 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-optimize'}
-
-
-
-
-
-
-
optimize_run.outputs
-
-
-
-
-
{'model': 'store://artifacts/hugging-face-trainer-avia/optimized_model@fbee1ead18444824a4b5c0308a677bf4'}
-
-
-
-
-
-
-

Running the training remotely#

-
-
-
project.build_function("hugging-face-classifier-trainer",with_mlrun=True)
-
-
-
-
-
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlrun/projects/operations.py:276: OverwriteBuildParamsWarning:
-
-The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.
-
-
-
> 2024-03-24 17:14:22,792 [info] Started building image: .mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest
-INFO[0000] Retrieving image manifest mlrun/mlrun:1.6.1  
-INFO[0000] Retrieving image mlrun/mlrun:1.6.1 from registry index.docker.io 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:1.6.1  
-INFO[0000] Returning cached image manifest              
-INFO[0000] Executing 0 build triggers                   
-INFO[0000] Building stage 'mlrun/mlrun:1.6.1' [idx: '0', base-idx: '-1'] 
-INFO[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. 
-INFO[0047] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt 
-INFO[0047] Initializing snapshotter ...                 
-INFO[0047] Taking snapshot of full filesystem...        
-INFO[0074] Cmd: /bin/sh                                 
-INFO[0074] Args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 
-INFO[0074] Running: [/bin/sh -c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 
-Installing /empty/requirements.txt...
-mlrun[complete]==1.6.1
-onnx~=1.14.1
-onnxruntime~=1.16.1
-optimum~=1.6.4
-transformers~=4.26.1
-datasets~=2.10.1
-scikit-learn~=1.0.2
-INFO[0074] Taking snapshot of full filesystem...        
-INFO[0078] No files were changed, appending empty layer to config. No layer added to image. 
-INFO[0078] RUN python -m pip install -r /empty/requirements.txt 
-INFO[0078] Cmd: /bin/sh                                 
-INFO[0078] Args: [-c python -m pip install -r /empty/requirements.txt] 
-INFO[0078] Running: [/bin/sh -c python -m pip install -r /empty/requirements.txt] 
-Requirement already satisfied: mlrun[complete]==1.6.1 in /opt/conda/lib/python3.9/site-packages (from -r /empty/requirements.txt (line 1)) (1.6.1)
-Collecting onnx~=1.14.1 (from -r /empty/requirements.txt (line 2))
-  Obtaining dependency information for onnx~=1.14.1 from https://files.pythonhosted.org/packages/ff/24/0e522fdcadf0e15fc304145a5b6e5d7246d7f2c507fd9bfe6e1fafb2aa95/onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
-Collecting onnxruntime~=1.16.1 (from -r /empty/requirements.txt (line 3))
-  Obtaining dependency information for onnxruntime~=1.16.1 from https://files.pythonhosted.org/packages/de/ab/ed3ae0d649cee41e870f8b1653cf4a1c1fc321e0ded4e3e1a3d4a25c0131/onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
-Collecting optimum~=1.6.4 (from -r /empty/requirements.txt (line 4))
-  Obtaining dependency information for optimum~=1.6.4 from https://files.pythonhosted.org/packages/31/72/a7e3b2c57d6368c5f4bb6fba54a85cbf07d25c385a2db3f1a638f3c0ddb2/optimum-1.6.4-py3-none-any.whl.metadata
-  Downloading optimum-1.6.4-py3-none-any.whl.metadata (17 kB)
-Collecting transformers~=4.26.1 (from -r /empty/requirements.txt (line 5))
-  Obtaining dependency information for transformers~=4.26.1 from https://files.pythonhosted.org/packages/1e/e2/60c3f4691b16d126ee9cfe28f598b13c424b60350ab339aba81aef054b8f/transformers-4.26.1-py3-none-any.whl.metadata
-  Downloading transformers-4.26.1-py3-none-any.whl.metadata (100 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.3/100.3 kB 6.2 MB/s eta 0:00:00
-Collecting datasets~=2.10.1 (from -r /empty/requirements.txt (line 6))
-  Obtaining dependency information for datasets~=2.10.1 from https://files.pythonhosted.org/packages/fe/17/5825fdf034ff1a315becdbb9b6fe5a2bd9d8e724464535f18809593bf9c2/datasets-2.10.1-py3-none-any.whl.metadata
-  Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)
-Collecting scikit-learn~=1.0.2 (from -r /empty/requirements.txt (line 7))
-  Obtaining dependency information for scikit-learn~=1.0.2 from https://files.pythonhosted.org/packages/57/aa/483fbe6b5314bce2d49801e6cec1f2139a9c220d0d51494788fff47233b3/scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
-Requirement already satisfied: urllib3<1.27,>=1.26.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.18)
-Requirement already satisfied: GitPython>=3.1.41,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.42)
-Requirement already satisfied: aiohttp~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.3)
-Requirement already satisfied: aiohttp-retry~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.3)
-Requirement already satisfied: click~=8.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.1.7)
-Requirement already satisfied: kfp~=1.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.22)
-Requirement already satisfied: nest-asyncio~=1.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.0)
-Requirement already satisfied: ipython~=8.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.18.1)
-Requirement already satisfied: nuclio-jupyter~=0.9.15 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.16)
-Requirement already satisfied: numpy<1.27.0,>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.4)
-Requirement already satisfied: pandas<2.2,>=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.4)
-Requirement already satisfied: pyarrow<15,>=10.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (14.0.2)
-Requirement already satisfied: pyyaml~=5.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.4.1)
-Requirement already satisfied: requests~=2.31 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.31.0)
-Requirement already satisfied: tabulate~=0.8.6 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.10)
-Requirement already satisfied: v3io~=0.5.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.23)
-Requirement already satisfied: pydantic>=1.10.8,~=1.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.10.14)
-Requirement already satisfied: mergedeep~=1.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.4)
-Requirement already satisfied: v3io-frames~=0.10.12 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.13)
-Requirement already satisfied: semver~=3.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
-Requirement already satisfied: dependency-injector~=4.41 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.41.0)
-Requirement already satisfied: fsspec==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
-Requirement already satisfied: v3iofs~=0.1.17 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.18)
-Requirement already satisfied: storey~=1.6.18 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.18)
-Requirement already satisfied: inflection~=0.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
-Requirement already satisfied: python-dotenv~=0.17.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.17.1)
-Requirement already satisfied: setuptools~=68.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (68.2.2)
-Requirement already satisfied: deprecated~=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.14)
-Requirement already satisfied: jinja2>=3.1.3,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.3)
-Requirement already satisfied: anyio~=3.7 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.7.1)
-Requirement already satisfied: orjson~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.15)
-Requirement already satisfied: adlfs==2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.0)
-Requirement already satisfied: aiobotocore<2.8,>=2.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5.4)
-Requirement already satisfied: avro~=1.11 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.11.3)
-Requirement already satisfied: azure-core~=1.24 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.30.0)
-Requirement already satisfied: azure-identity~=1.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.15.0)
-Requirement already satisfied: azure-keyvault-secrets~=4.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.8.0)
-Requirement already satisfied: boto3<1.29.0,>=1.28.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.28.17)
-Requirement already satisfied: dask~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)
-Requirement already satisfied: databricks-sdk~=0.13.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.13.0)
-Requirement already satisfied: distributed~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)
-Requirement already satisfied: gcsfs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
-Requirement already satisfied: google-cloud-bigquery[bqstorage,pandas]==3.14.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.14.1)
-Requirement already satisfied: graphviz~=0.20.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.1)
-Requirement already satisfied: kafka-python~=2.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.2)
-Requirement already satisfied: mlflow~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.10.2)
-Requirement already satisfied: msrest~=0.6.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.21)
-Requirement already satisfied: plotly<5.12.0,~=5.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.11.0)
-Requirement already satisfied: pyopenssl>=23 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.0)
-Requirement already satisfied: redis~=4.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.6.0)
-Requirement already satisfied: s3fs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
-Requirement already satisfied: sqlalchemy~=1.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.51)
-Requirement already satisfied: azure-datalake-store<0.1,>=0.0.46 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.0.53)
-Requirement already satisfied: azure-storage-blob>=12.12.0 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (12.19.0)
-Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.1.1)
-Requirement already satisfied: google-auth>=1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.28.1)
-Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: google-cloud-storage in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.14.0)
-Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.1)
-Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)
-Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.7.0)
-Requirement already satisfied: packaging>=20.0.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1)
-Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)
-Requirement already satisfied: db-dtypes<2.0.0dev,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.24.0)
-Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)
-Requirement already satisfied: protobuf>=3.20.2 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (3.20.3)
-Requirement already satisfied: typing-extensions>=3.6.2.1 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (4.10.0)
-Collecting coloredlogs (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for coloredlogs from https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata
-  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
-Collecting flatbuffers (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/bf/45/c961e3cb6ddad76b325c163d730562bb6deb1ace5acbed0306f5fbefb90e/flatbuffers-24.3.7-py2.py3-none-any.whl.metadata
-  Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)
-Collecting sympy (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/d2/05/e6600db80270777c4a64238a98d442f0fd07cc8915be2a1c16da7f2b9e74/sympy-1.12-py3-none-any.whl.metadata
-  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
-Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0a/fd/280f4385e76f3c1890efc15fa93f7206134fefad6351397e1bfab6d0d0de/transformers-4.39.1-py3-none-any.whl.metadata
-  Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 40.1 MB/s eta 0:00:00
-Collecting torch>=1.9 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for torch>=1.9 from https://files.pythonhosted.org/packages/98/04/95a12556d068786d6505c609daf2805bed91c9210c5185499a7c121eba47/torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata
-  Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata (25 kB)
-Collecting numpy<1.27.0,>=1.16.5 (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1))
-  Obtaining dependency information for numpy<1.27.0,>=1.16.5 from https://files.pythonhosted.org/packages/4c/b9/038abd6fbd67b05b03cb1af590cfc02b7f1e5a37af7ac6a868f5093c29f5/numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
-Collecting huggingface-hub>=0.8.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for huggingface-hub>=0.8.0 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata
-  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
-Collecting filelock (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata
-  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
-Collecting regex!=2019.12.17 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/05/9e/80c20f1151432a6025690c9c2037053039b028a7b236fa81d7e7ac9dec60/regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 217.5 MB/s eta 0:00:00
-Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for tokenizers!=0.11.3,<0.14,>=0.11.1 from https://files.pythonhosted.org/packages/d6/27/07a337087dd507170a1b20fed3bbf8da81401185a7130a6e74e440c52040/tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
-Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.9/site-packages (from transformers~=4.26.1->-r /empty/requirements.txt (line 5)) (4.65.0)
-Collecting dill<0.3.7,>=0.3.0 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for dill<0.3.7,>=0.3.0 from https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl.metadata
-  Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)
-Requirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets~=2.10.1->-r /empty/requirements.txt (line 6)) (3.4.1)
-Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl.metadata
-  Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
-Collecting responses<0.19 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for responses<0.19 from https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl.metadata
-  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
-Requirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.12.0)
-Requirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.3.2)
-Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (3.3.0)
-Requirement already satisfied: botocore<1.31.18,>=1.31.17 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.31.17)
-Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
-Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.11.0)
-Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
-Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.2.0)
-Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)
-Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.0.5)
-Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.9.4)
-Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.3)
-Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.4)
-Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
-Requirement already satisfied: exceptiongroup in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: six>=1.11.0 in /opt/conda/lib/python3.9/site-packages (from azure-core~=1.24->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
-Requirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (42.0.2)
-Requirement already satisfied: msal<2.0.0,>=1.24.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.27.0)
-Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.1.0)
-Requirement already satisfied: isodate>=0.6.1 in /opt/conda/lib/python3.9/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.1)
-Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.1)
-Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.2)
-Requirement already satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.2.1)
-Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)
-Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.0)
-Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.1)
-Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
-Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.7)
-Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.8)
-Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)
-Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)
-Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.4)
-Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)
-Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.9/site-packages (from GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.11)
-Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.19.1)
-Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.6)
-Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.43)
-Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.2)
-Requirement already satisfied: stack-data in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.3)
-Requirement already satisfied: traitlets>=5 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.14.1)
-Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9.0)
-Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from jinja2>=3.1.3,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.5)
-Requirement already satisfied: absl-py<2,>=0.9 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.0)
-Requirement already satisfied: kubernetes<26,>=8.0.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (25.3.0)
-Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.12.11)
-Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.1)
-Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.5)
-Requirement already satisfied: jsonschema<5,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.21.1)
-Requirement already satisfied: strip-hints<1,>=0.1.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.10)
-Requirement already satisfied: docstring-parser<1,>=0.7.3 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.15)
-Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.16 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.16)
-Requirement already satisfied: fire<1,>=0.3.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.0)
-Requirement already satisfied: uritemplate<4,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)
-Requirement already satisfied: typer<1.0,>=0.3.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
-Requirement already satisfied: entrypoints<1 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4)
-Requirement already satisfied: pytz<2024 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.4)
-Requirement already satisfied: sqlparse<1,>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4.4)
-Requirement already satisfied: alembic!=1.10.0,<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13.1)
-Requirement already satisfied: docker<8,>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.0)
-Requirement already satisfied: Flask<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
-Requirement already satisfied: querystring-parser<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.4)
-Requirement already satisfied: markdown<4,>=3.3 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.5.2)
-Requirement already satisfied: matplotlib<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.8.3)
-Requirement already satisfied: gunicorn<22 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)
-Requirement already satisfied: requests-oauthlib>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
-Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.2.2)
-Requirement already satisfied: nbconvert>=6.4.5 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.16.1)
-Requirement already satisfied: notebook<7.0.0,>=6.4 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.5.6)
-Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.9/site-packages (from pandas<2.2,>=1.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.1)
-Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly<5.12.0,~=5.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.2.3)
-Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.9/site-packages (from requests~=2.31->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.4)
-Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.9/site-packages (from sqlalchemy~=1.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.3)
-Requirement already satisfied: nuclio-sdk>=0.5.3 in /opt/conda/lib/python3.9/site-packages (from storey~=1.6.18->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.9)
-Collecting networkx (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for networkx from https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl.metadata
-  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
-Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cuda-nvrtc-cu12==12.1.105 from https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cuda-runtime-cu12==12.1.105 from https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cuda-cupti-cu12==12.1.105 from https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cudnn-cu12==8.9.2.26 from https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cublas-cu12==12.1.3.1 from https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cufft-cu12==11.0.2.54 from https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-curand-cu12==10.3.2.106 from https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cusolver-cu12==11.4.5.107 from https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cusparse-cu12==12.1.0.106 from https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-nccl-cu12==2.19.3 from https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)
-Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-nvtx-cu12==12.1.105 from https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)
-Collecting triton==2.2.0 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for triton==2.2.0 from https://files.pythonhosted.org/packages/6a/5c/01d9f062f719581cf6e60053e1a005d666ec67dcb59630fffaa3a3e5c9d8/triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
-Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-nvjitlink-cu12 from https://files.pythonhosted.org/packages/58/d1/d1c80553f9d5d07b6072bc132607d75a0ef3600e28e1890e11c0f55d7346/nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata
-  Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
-INFO: pip is looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.
-Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/a4/73/f620d76193954e16db3d5c53a07d956d7b9c800e570758d3bff91906d4a4/transformers-4.39.0-py3-none-any.whl.metadata
-  Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 115.9 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata
-  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 126.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3e/6b/1b589f7b69aaea8193cf5bc91cf97410284aecd97b6312cdb08baedbdffe/transformers-4.38.1-py3-none-any.whl.metadata
-  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 138.2 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/91/89/5416dc364c7ef0711c564fd61a69b03d1e40eeb5c506c38e53ba8a969e79/transformers-4.38.0-py3-none-any.whl.metadata
-  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 186.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/85/f6/c5065913119c41ecad148c34e3a861f719e16b89a522287213698da911fc/transformers-4.37.2-py3-none-any.whl.metadata
-  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 236.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ad/67/b4d6a51dcaf988cb45b31e26c6e33fb169fe34ba5fb168b086309bd7c028/transformers-4.37.1-py3-none-any.whl.metadata
-  Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 156.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3c/45/52133ce6bce49a099cc865599803bf1fad93de887276f728e56848d77a70/transformers-4.37.0-py3-none-any.whl.metadata
-  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 102.0 MB/s eta 0:00:00
-INFO: pip is still looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata
-  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 108.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/fc/04/0aad491cd98b09236c54ab849863ee85421eeda5138bbf9d33ecc594652b/transformers-4.36.1-py3-none-any.whl.metadata
-  Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 140.6 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0f/12/d8e27a190ca67811f81deea3183b528d9169f10b74d827e0b9211520ecfa/transformers-4.36.0-py3-none-any.whl.metadata
-  Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 267.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
-  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 130.2 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/92/ba/cfff7e01f7070d9fca3964bf42b2257b86964c3e6763b8d5435436cc1d77/transformers-4.35.1-py3-none-any.whl.metadata
-  Downloading transformers-4.35.1-py3-none-any.whl.metadata (123 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 183.6 MB/s eta 0:00:00
-INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata
-  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 177.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata
-  Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 270.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata
-  Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 133.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata
-  Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 163.1 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata
-  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.9 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/13/30/54b59e73400df3de506ad8630284e9fd63f4b94f735423d55fc342181037/transformers-4.33.1-py3-none-any.whl.metadata
-  Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.2 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata
-  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 185.9 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl.metadata
-  Downloading transformers-4.32.1-py3-none-any.whl.metadata (118 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 144.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata
-  Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 150.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata
-  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.9/116.9 kB 156.7 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/5b/0b/e45d26ccd28568013523e04f325432ea88a442b4e3020b757cf4361f0120/transformers-4.30.2-py3-none-any.whl.metadata
-  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.7 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b8/df/b01b5e67cde3883757c9212455cbb9169385dcab5858b7172199126b756d/transformers-4.30.1-py3-none-any.whl.metadata
-  Downloading transformers-4.30.1-py3-none-any.whl.metadata (113 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e2/72/1af3d38e98fdcceb3876de4567ac395a66c26976e259fe2d46266e052d61/transformers-4.30.0-py3-none-any.whl.metadata
-  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 266.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/17/aa/a89864288afe45abe1ab79f002140a20348140e86836d96096d8f8a3bac0/transformers-4.29.2-py3-none-any.whl.metadata
-  Downloading transformers-4.29.2-py3-none-any.whl.metadata (112 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 272.7 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl.metadata
-  Downloading transformers-4.29.1-py3-none-any.whl.metadata (112 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 262.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/45/e4/4914b11df70954d95a7c36b74bf9010c8594fcec960471479449b0deb4f7/transformers-4.29.0-py3-none-any.whl.metadata
-  Downloading transformers-4.29.0-py3-none-any.whl.metadata (111 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.9/111.9 kB 269.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl.metadata
-  Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 245.6 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/8b/13/1ce598763b3669d43f192a7911bf2bf730a328012ab8801b93187a4f70d0/transformers-4.28.0-py3-none-any.whl.metadata
-  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 256.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/87/f0/2a152ed10ab8601431e87a606d397f7473c5fa4f8162f4ec5bda6ddb2df4/transformers-4.27.4-py3-none-any.whl.metadata
-  Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 254.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/52/ac/9dc5a17ba60bc354d99250d9d1629f99d76f6729cee438fa91c8cc74bc5d/transformers-4.27.3-py3-none-any.whl.metadata
-  Downloading transformers-4.27.3-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 251.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/73/f0/4a795505387a3e7cd7f0c2a2a87f876658f9a07947a38fb67bffceff9246/transformers-4.27.2-py3-none-any.whl.metadata
-  Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 246.1 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/6d/9b/2f536f9e73390209e0b27b74691355dac494b7ec8154f3012fdc6debbae7/transformers-4.27.1-py3-none-any.whl.metadata
-  Downloading transformers-4.27.1-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 114.0 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/4d/3e/1378ed266cf991f5ab5fcb29e953d97d793c7f9242ea5dc52f856415ea3a/transformers-4.27.0-py3-none-any.whl.metadata
-  Downloading transformers-4.27.0-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 247.2 MB/s eta 0:00:00
-Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for sentencepiece!=0.1.92,>=0.1.91 from https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
-Collecting protobuf>=3.20.2 (from onnx~=1.14.1->-r /empty/requirements.txt (line 2))
-  Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/38/b1/d9b615dceb67ac38e13cbd7680c27182b40154996022cbb244ba1ac7d30f/protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata
-  Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)
-Requirement already satisfied: future>=0.18.2 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
-Requirement already satisfied: ujson>=3 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.0)
-Requirement already satisfied: googleapis-common-protos>=1.5.3 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)
-Requirement already satisfied: grpcio-tools!=1.34.0,<1.49,>=1.30 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)
-Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata
-  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
-INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
-Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/c6/c9/820b5ab056f4ada76fbe05bd481a948f287957d6cbfd59e2dd2618b408c1/multiprocess-0.70.15-py39-none-any.whl.metadata
-  Downloading multiprocess-0.70.15-py39-none-any.whl.metadata (7.2 kB)
-  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/6a/f4/fbeb03ef7abdda54db4a6a75c971b88ab73d724ff09e3275cc1e99f1c946/multiprocess-0.70.14-py39-none-any.whl.metadata
-  Downloading multiprocess-0.70.14-py39-none-any.whl.metadata (6.6 kB)
-Collecting mpmath>=0.19 (from sympy->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for mpmath>=0.19 from https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl.metadata
-  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
-Requirement already satisfied: Mako in /opt/conda/lib/python3.9/site-packages (from alembic!=1.10.0,<2->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.2)
-Requirement already satisfied: cffi in /opt/conda/lib/python3.9/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
-Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire<1,>=0.3.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)
-Requirement already satisfied: Werkzeug>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)
-Requirement already satisfied: itsdangerous>=2.1.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.2)
-Requirement already satisfied: blinker>=1.6.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)
-Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.0.1)
-Requirement already satisfied: httplib2<1dev,>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.22.0)
-Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)
-Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.3.3)
-Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)
-Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9)
-Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery-storage<3.0.0dev,>=2.6.0->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.23.0)
-Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-storage->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.0)
-Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata>=4.13.0->dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.17.0)
-Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from jedi>=0.16->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.3)
-Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.12.1)
-Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.33.0)
-Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)
-Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.9/site-packages (from kubernetes<26,>=8.0.0->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)
-Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.1)
-Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.49.0)
-Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.5)
-Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (10.2.0)
-Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.1)
-Requirement already satisfied: importlib-resources>=3.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.2)
-Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from msal<2.0.0,>=1.24.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.0)
-Requirement already satisfied: portalocker<3,>=1.0 in /opt/conda/lib/python3.9/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)
-Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.12.3)
-Requirement already satisfied: bleach!=5.0.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.0)
-Requirement already satisfied: defusedxml in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.1)
-Requirement already satisfied: jupyter-core>=4.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.7.1)
-Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)
-Requirement already satisfied: mistune<4,>=2.0.3 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
-Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
-Requirement already satisfied: nbformat>=5.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.2)
-Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)
-Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.1)
-Requirement already satisfied: pyzmq<25,>=17 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.1)
-Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1.0)
-Requirement already satisfied: jupyter-client<8,>=5.3.4 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.4.9)
-Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)
-Requirement already satisfied: ipykernel in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.29.3)
-Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.2)
-Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)
-Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.0)
-Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
-Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.0)
-Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.13)
-Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from requests-oauthlib>=0.5.0->msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.2.2)
-Requirement already satisfied: wheel in /opt/conda/lib/python3.9/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.41.2)
-Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.1)
-Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)
-Requirement already satisfied: pure-eval in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.2)
-Requirement already satisfied: webencodings in /opt/conda/lib/python3.9/site-packages (from bleach!=5.0.0->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
-Requirement already satisfied: pycparser in /opt/conda/lib/python3.9/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.21)
-Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /opt/conda/lib/python3.9/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)
-Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.9/site-packages (from jupyter-core>=4.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.10.0)
-Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.12.5)
-Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.4)
-Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.9/site-packages (from nbformat>=5.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.19.1)
-Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
-Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.9/site-packages (from argon2-cffi->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)
-Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.9/site-packages (from beautifulsoup4->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5)
-Requirement already satisfied: comm>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.1)
-Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.1)
-Requirement already satisfied: jupyter-events>=0.9.0 in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
-Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.2)
-Requirement already satisfied: overrides in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.7.0)
-Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.7)
-Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.4)
-Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.1)
-Requirement already satisfied: fqdn in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)
-Requirement already satisfied: isoduration in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (20.11.0)
-Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1)
-Requirement already satisfied: uri-template in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)
-Requirement already satisfied: webcolors>=1.11 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13)
-Requirement already satisfied: arrow>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)
-Requirement already satisfied: types-python-dateutil>=2.8.10 in /opt/conda/lib/python3.9/site-packages (from arrow>=0.15.0->isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.19.20240106)
-Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 274.2 MB/s eta 0:00:00
-Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 277.9 MB/s eta 0:00:00
-Downloading optimum-1.6.4-py3-none-any.whl (227 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.8/227.8 kB 291.3 MB/s eta 0:00:00
-Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 242.4 MB/s eta 0:00:00
-Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 469.0/469.0 kB 185.9 MB/s eta 0:00:00
-Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 275.9 MB/s eta 0:00:00
-Downloading dill-0.3.6-py3-none-any.whl (110 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 282.3 MB/s eta 0:00:00
-Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 346.4/346.4 kB 311.7 MB/s eta 0:00:00
-Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 269.6 MB/s eta 0:00:00
-Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 773.4/773.4 kB 311.9 MB/s eta 0:00:00
-Downloading responses-0.18.0-py3-none-any.whl (38 kB)
-Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 264.1 MB/s eta 0:00:00
-Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.5/755.5 MB 204.0 MB/s eta 0:00:00
-Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 40.3 MB/s eta 0:00:00
-Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 43.0 MB/s eta 0:00:00
-Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 46.9 MB/s eta 0:00:00
-Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 51.0 MB/s eta 0:00:00
-Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 58.2 MB/s eta 0:00:00
-Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 69.0 MB/s eta 0:00:00
-Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 36.0 MB/s eta 0:00:00
-Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 52.8 MB/s eta 0:00:00
-Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 45.9 MB/s eta 0:00:00
-Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.0/166.0 MB 19.6 MB/s eta 0:00:00
-Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 27.7 MB/s eta 0:00:00
-Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.9/167.9 MB 41.3 MB/s eta 0:00:00
-Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 42.8 MB/s eta 0:00:00
-Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 192.0 MB/s eta 0:00:00
-Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
-Downloading flatbuffers-24.3.7-py2.py3-none-any.whl (26 kB)
-Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 kB 100.7 MB/s eta 0:00:00
-Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 41.4 MB/s eta 0:00:00
-Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 253.7 MB/s eta 0:00:00
-Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 45.4 MB/s eta 0:00:00
-Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 46.1 MB/s eta 0:00:00
-Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 43.7 MB/s eta 0:00:00
-Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 43.8 MB/s eta 0:00:00
-Installing collected packages: tokenizers, sentencepiece, mpmath, flatbuffers, sympy, regex, protobuf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, humanfriendly, filelock, dill, triton, responses, onnx, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, coloredlogs, transformers, scikit-learn, onnxruntime, nvidia-cusolver-cu12, torch, datasets, optimum
-  Attempting uninstall: protobuf
-    Found existing installation: protobuf 3.20.3
-    Uninstalling protobuf-3.20.3:
-      Successfully uninstalled protobuf-3.20.3
-  Attempting uninstall: numpy
-    Found existing installation: numpy 1.26.4
-    Uninstalling numpy-1.26.4:
-      Successfully uninstalled numpy-1.26.4
-  Attempting uninstall: scikit-learn
-    Found existing installation: scikit-learn 1.4.1.post1
-    Uninstalling scikit-learn-1.4.1.post1:
-      Successfully uninstalled scikit-learn-1.4.1.post1
-Successfully installed coloredlogs-15.0.1 datasets-2.10.1 dill-0.3.6 filelock-3.13.1 flatbuffers-24.3.7 huggingface-hub-0.21.4 humanfriendly-10.0 mpmath-1.3.0 multiprocess-0.70.14 networkx-3.2.1 numpy-1.23.5 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 onnx-1.14.1 onnxruntime-1.16.3 optimum-1.6.4 protobuf-3.20.2 regex-2023.12.25 responses-0.18.0 scikit-learn-1.0.2 sentencepiece-0.2.0 sympy-1.12 tokenizers-0.13.3 torch-2.2.1 transformers-4.26.1 triton-2.2.0
-WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
-INFO[0238] Taking snapshot of full filesystem...        
-INFO[0463] Pushing image to docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest 
-INFO[0493] Pushed docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer@sha256:691d0bb3c23487b4b5d2f84ab323c24735626ee81681475f53a4158b72d4cfee 
-
-
-
BuildStatus(ready=True, outputs={'image': '.mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest'})
-
-
-
-
-
-
-
train_run = hugging_face_classifier_trainer.run(params={
-                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
-                                                        "drop_columns": [
-                                                            "airline_sentiment_confidence",
-                                                            "negativereason_confidence",
-                                                        ],
-                                                        "pretrained_tokenizer": "distilbert-base-uncased",
-                                                        "pretrained_model": "distilbert-base-uncased",
-                                                        "model_class": "transformers.AutoModelForSequenceClassification",
-                                                        "label_name": "airline_sentiment",
-                                                        "num_of_train_samples": 100,
-                                                        "metrics": ["accuracy", "f1"],
-                                                        "random_state": 42,
-                                                        **additional_parameters
-                                                    },
-                                                    handler="train",                                                    
-                                                )
-
-
-
-
-
> 2024-03-24 17:22:42,252 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '53252ce7aacb4b1aacf86bf3b862daa2', 'db': 'http://mlrun-api:8080'}
-> 2024-03-24 17:22:42,536 [info] Job is running in the background, pod: hugging-face-classifier-trainer-train-dqqfr
-> 2024-03-24 17:24:43,288 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2
-> 2024-03-24 17:24:43,847 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub
-Downloading metadata: 100%|██████████| 1.03k/1.03k [00:00<00:00, 6.77MB/s]
-Downloading and preparing dataset None/None (download: 265.13 KiB, generated: 1.50 MiB, post-processed: Unknown size, total: 1.76 MiB) to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...
-Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]
-Downloading data: 100%|██████████| 92.6k/92.6k [00:00<00:00, 59.3MB/s]
-Downloading data files:  33%|███▎      | 1/3 [00:00<00:00,  6.42it/s]
-Downloading data: 100%|██████████| 605k/605k [00:00<00:00, 81.8MB/s]
-Downloading data files:  67%|██████▋   | 2/3 [00:00<00:00,  6.59it/s]
-Downloading data: 100%|██████████| 179k/179k [00:00<00:00, 50.9MB/s]
-Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  6.62it/s]
-Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1263.34it/s]
-Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.
-100%|██████████| 3/3 [00:00<00:00, 978.99it/s]                              
-Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
-- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
-- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
-Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
-You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
-> 2024-03-24 17:24:47,076 [info] training 'huggingface-model'
-The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
-***** Running training *****
-  Num examples = 100
-  Num Epochs = 3
-  Instantaneous batch size per device = 16
-  Total train batch size (w. parallel, distributed & accumulation) = 16
-  Gradient Accumulation steps = 1
-  Total optimization steps = 21
-  Number of trainable parameters = 66955010
-huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
-To disable this warning, you can either:
-	- Avoid using `tokenizers` before the fork if possible
-	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
-  0%|          | 0/21 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
- 33%|███▎      | 7/21 [00:16<00:28,  2.02s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-
-{'loss': 0.7005, 'learning_rate': 1.904761904761905e-05, 'epoch': 0.14}
-{'loss': 0.6528, 'learning_rate': 1.8095238095238097e-05, 'epoch': 0.29}
-{'loss': 0.6468, 'learning_rate': 1.7142857142857142e-05, 'epoch': 0.43}
-{'loss': 0.5877, 'learning_rate': 1.6190476190476193e-05, 'epoch': 0.57}
-{'loss': 0.6694, 'learning_rate': 1.523809523809524e-05, 'epoch': 0.71}
-{'loss': 0.5219, 'learning_rate': 1.4285714285714287e-05, 'epoch': 0.86}
-{'loss': 0.7052, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}
-  0%|          | 0/2 [00:00<?, ?it/s]
-100%|██████████| 2/2 [00:00<00:00,  4.86it/s]main.py:561: FutureWarning:
-
-load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
-
-
-
-Downloading builder script: 4.21kB [00:00, 11.4MB/s]                   
-
-
-Downloading builder script: 6.50kB [00:00, 21.8MB/s]                   
-                                              
- 33%|███▎      | 7/21 [00:18<00:28,  2.02s/it]
-100%|██████████| 2/2 [00:00<00:00,  4.86it/s]
- 67%|██████▋   | 14/21 [00:34<00:14,  2.07s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-{'eval_loss': 0.5350419878959656, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.5536, 'eval_samples_per_second': 15.448, 'eval_steps_per_second': 1.287, 'epoch': 1.0}
-{'loss': 0.5942, 'learning_rate': 1.2380952380952383e-05, 'epoch': 1.14}
-{'loss': 0.5899, 'learning_rate': 1.1428571428571429e-05, 'epoch': 1.29}
-{'loss': 0.5317, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}
-{'loss': 0.4516, 'learning_rate': 9.523809523809525e-06, 'epoch': 1.57}
-{'loss': 0.5121, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.71}
-{'loss': 0.5264, 'learning_rate': 7.61904761904762e-06, 'epoch': 1.86}
-{'loss': 0.539, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}
-
-  0%|          | 0/2 [00:00<?, ?it/s]
-                                               A
- 67%|██████▋   | 14/21 [00:35<00:14,  2.07s/it]
-100%|██████████| 2/2 [00:00<00:00,  4.95it/s]
-100%|██████████| 21/21 [00:52<00:00,  2.05s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-{'eval_loss': 0.4877033233642578, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.1789, 'eval_samples_per_second': 20.357, 'eval_steps_per_second': 1.696, 'epoch': 2.0}
-{'loss': 0.4059, 'learning_rate': 5.7142857142857145e-06, 'epoch': 2.14}
-{'loss': 0.5851, 'learning_rate': 4.761904761904762e-06, 'epoch': 2.29}
-{'loss': 0.4135, 'learning_rate': 3.80952380952381e-06, 'epoch': 2.43}
-{'loss': 0.6571, 'learning_rate': 2.8571428571428573e-06, 'epoch': 2.57}
-{'loss': 0.4883, 'learning_rate': 1.904761904761905e-06, 'epoch': 2.71}
-{'loss': 0.5114, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}
-{'loss': 0.5215, 'learning_rate': 0.0, 'epoch': 3.0}
-
-  0%|          | 0/2 [00:00<?, ?it/s]
-                                               A
-100%|██████████| 21/21 [00:54<00:00,  2.05s/it]
-100%|██████████| 2/2 [00:00<00:00,  6.38it/s]
-                                             
-
-Training completed. Do not forget to share your model on huggingface.co/models =)
-
-
-100%|██████████| 21/21 [00:55<00:00,  2.62s/it]
-tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json
-Special tokens file saved in /tmp/tokenizer/special_tokens_map.json
-Configuration saved in /tmp/model/config.json
-Model weights saved in /tmp/model/pytorch_model.bin
-{'eval_loss': 0.4750453531742096, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.0524, 'eval_samples_per_second': 22.806, 'eval_steps_per_second': 1.9, 'epoch': 3.0}
-{'train_runtime': 55.1543, 'train_samples_per_second': 5.439, 'train_steps_per_second': 0.381, 'train_loss': 0.5624780683290391, 'epoch': 3.0}
-> 2024-03-24 17:26:00,230 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia', 'logs_cmd': 'mlrun logs 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia'}
-> 2024-03-24 17:26:00,231 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/hugging-face-trainer-avia/jobs/monitor/53252ce7aacb4b1aacf86bf3b862daa2/overview'}
-> 2024-03-24 17:26:00,231 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:24:39completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=job
owner=avia
mlrun/client_version=1.6.1
mlrun/client_python_version=3.9.16
host=hugging-face-classifier-trainer-train-dqqfr
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.5215
learning_rate=0.0
eval_loss=0.4750453531742096
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=1.0524
eval_samples_per_second=22.806
eval_steps_per_second=1.9
train_runtime=55.1543
train_samples_per_second=5.439
train_steps_per_second=0.381
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:26:09,792 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
-
-
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/static/function.html b/functions/development/hugging_face_classifier_trainer/0.2.0/static/function.html deleted file mode 100644 index 2bf1ffb9..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/static/function.html +++ /dev/null @@ -1,390 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: hugging-face-classifier-trainer
-  tag: ''
-  hash: e8113e81f04c96fc9a8a94e717dea81ee3e05a18
-  project: ''
-  labels:
-    author: davids
-  categories:
-  - machine-learning
-  - model-training
-spec:
-  command: ''
-  args: []
-  image: ''
-  build:
-    functionSourceCode: 
-    base_image: mlrun/mlrun
-    commands: []
-    code_origin: ''
-    origin_filename: ''
-    requirements:
-    - onnx~=1.14.1
-    - onnxruntime~=1.16.1
-    - optimum~=1.6.4
-    - transformers~=4.26.1
-    - datasets~=2.10.1
-    - scikit-learn~=1.0.2
-  entry_points:
-    add_interface:
-      name: add_interface
-      doc: 'Enrich the object with this interface properties, methods and functions,
-        so it will have this TensorFlow.Keras
-
-        MLRuns features.'
-      parameters:
-      - name: cls
-      - name: obj
-        type: Trainer
-        doc: The object to enrich his interface.
-      - name: restoration
-        type: MLRunInterfaceRestorationType
-        doc: Restoration information tuple as returned from 'remove_interface' in
-          order to add the interface in a certain state.
-        default: null
-      outputs: []
-      lineno: 146
-      has_varargs: false
-      has_kwargs: false
-    mlrun_optimize:
-      name: mlrun_optimize
-      doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when
-        using horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-      outputs: []
-      lineno: 79
-      has_varargs: false
-      has_kwargs: false
-    wrapper:
-      name: wrapper
-      doc: ''
-      parameters:
-      - name: self
-        type: Trainer
-      outputs: []
-      lineno: 173
-      has_varargs: true
-      has_kwargs: true
-    enable_auto_logging:
-      name: enable_auto_logging
-      doc: ''
-      parameters:
-      - name: self
-      - name: context
-        type: MLClientCtx
-      - name: model_name
-        type: str
-        default: model
-      - name: tag
-        type: str
-        default: ''
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs: []
-      lineno: 114
-      has_varargs: false
-      has_kwargs: false
-    mlrun_train:
-      name: mlrun_train
-      doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using
-        horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-      outputs: []
-      lineno: 164
-      has_varargs: false
-      has_kwargs: false
-    on_epoch_begin:
-      name: on_epoch_begin
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 220
-      has_varargs: false
-      has_kwargs: true
-    on_epoch_end:
-      name: on_epoch_end
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 229
-      has_varargs: false
-      has_kwargs: true
-    on_log:
-      name: on_log
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      - name: logs
-        type: Dict[str, float]
-        default: null
-      outputs: []
-      lineno: 238
-      has_varargs: false
-      has_kwargs: true
-    on_train_begin:
-      name: on_train_begin
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 262
-      has_varargs: false
-      has_kwargs: true
-    on_train_end:
-      name: on_train_end
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      - name: model
-        type: PreTrainedModel
-        default: null
-      - name: tokenizer
-        type: PreTrainedTokenizer
-        default: null
-      outputs: []
-      lineno: 271
-      has_varargs: false
-      has_kwargs: true
-    on_evaluate:
-      name: on_evaluate
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 322
-      has_varargs: false
-      has_kwargs: true
-    apply_mlrun:
-      name: apply_mlrun
-      doc: Wrap the given model with MLRun's interface providing it with mlrun's additional
-        features.
-      parameters:
-      - name: huggingface_object
-        doc: The model to wrap. Can be loaded from the model path given as well.
-      - name: model_name
-        type: str
-        doc: 'The model name to use for storing the model artifact. Default: "model".'
-        default: null
-      - name: tag
-        type: str
-        doc: The model's tag to log with.
-        default: ''
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context to work with. If no context is given it will be retrieved
-          via 'mlrun.get_or_create_ctx(None)'
-        default: null
-      - name: auto_log
-        type: bool
-        doc: 'Whether to enable MLRun''s auto logging. Default: True.'
-        default: true
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs: []
-      lineno: 421
-      has_varargs: false
-      has_kwargs: true
-    train:
-      name: train
-      doc: 'Training and evaluating a pretrained model with a pretrained tokenizer
-        over a dataset.
-
-        The dataset can be either be the name of the dataset that contains in the
-        HuggingFace hub,
-
-        or a URI or a FeatureVector'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-      - name: hf_dataset
-        type: str
-        doc: The name of the dataset to get from the HuggingFace hub
-        default: null
-      - name: dataset
-        type: DataItem
-        doc: The dataset to train the model on. Can be either a URI or a FeatureVector
-        default: null
-      - name: test_set
-        type: DataItem
-        doc: The test set to train the model with.
-        default: null
-      - name: drop_columns
-        type: Optional[List[str]]
-        doc: The columns to drop from the dataset.
-        default: null
-      - name: pretrained_tokenizer
-        type: str
-        doc: The name of the pretrained tokenizer from the HuggingFace hub.
-        default: null
-      - name: pretrained_model
-        type: str
-        doc: The name of the pretrained model from the HuggingFace hub.
-        default: null
-      - name: model_class
-        type: str
-        doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-        default: null
-      - name: model_name
-        type: str
-        doc: The model's name to use for storing the model artifact, default to 'model'
-        default: huggingface-model
-      - name: label_name
-        type: str
-        doc: The target label of the column in the dataset.
-        default: labels
-      - name: text_col
-        type: str
-        doc: The input text column un the dataset.
-        default: text
-      - name: num_of_train_samples
-        type: int
-        doc: Max number of training samples, for debugging.
-        default: null
-      - name: train_test_split_size
-        type: float
-        doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset
-          to include in the test split.
-        default: null
-      - name: metrics
-        type: List[str]
-        doc: List of different metrics for evaluate the model such as f1, accuracy
-          etc.
-        default: null
-      - name: random_state
-        type: int
-        doc: Random state for train_test_split
-        default: null
-      outputs: []
-      lineno: 647
-      has_varargs: false
-      has_kwargs: false
-    preprocess_function:
-      name: preprocess_function
-      doc: ''
-      parameters:
-      - name: examples
-      outputs: []
-      lineno: 696
-      has_varargs: false
-      has_kwargs: false
-    optimize:
-      name: optimize
-      doc: Optimizing the transformer model using ONNX optimization.
-      parameters:
-      - name: model_path
-        type: str
-        doc: The path of the model to optimize.
-      - name: model_name
-        type: str
-        doc: Name of the optimized model.
-        default: optimized_model
-      - name: target_dir
-        type: str
-        doc: The directory to save the ONNX model.
-        default: ./optimized
-      - name: optimization_level
-        type: int
-        doc: Optimization level performed by ONNX Runtime of the loaded graph. (default
-          is 1)
-        default: 1
-      outputs: []
-      lineno: 799
-      has_varargs: false
-      has_kwargs: false
-  description: Automatic train and optimize functions for HuggingFace framework
-  default_handler: train
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/static/hugging_face_classifier_trainer.html b/functions/development/hugging_face_classifier_trainer/0.2.0/static/hugging_face_classifier_trainer.html deleted file mode 100644 index 99a105cb..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/static/hugging_face_classifier_trainer.html +++ /dev/null @@ -1,972 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer.hugging_face_classifier_trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for hugging_face_classifier_trainer.hugging_face_classifier_trainer

-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import mlrun.datastore
-import mlrun.utils
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-
[docs]class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "optimize", - ] - -
[docs] @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper
- -
[docs] def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data
- - -
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - -
[docs] @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper
- - -
[docs]class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - -
[docs] def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([])
- -
[docs] def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics()
- -
[docs] def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score)
- -
[docs] def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True
- -
[docs] def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - )
- -
[docs] def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return
- - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact)
- - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -
[docs]def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError
- - -# ---------------------- from auto_trainer-------------------------------- -
[docs]class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_"
- - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. - :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. 
- """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {<old_name>: <new_name>, ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -
[docs]def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. - :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. 
- :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." 
- ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train()
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -
[docs]def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/static/item.html b/functions/development/hugging_face_classifier_trainer/0.2.0/static/item.html deleted file mode 100644 index 7db7e49b..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/static/item.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-training
-description: Automatic train and optimize functions for HuggingFace framework
-doc: ''
-example: hugging_face_classifier_trainer.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: davids
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.6.1
-name: hugging_face_classifier_trainer
-platformVersion: 3.5.5
-spec:
-  filename: hugging_face_classifier_trainer.py
-  handler: train
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - onnx~=1.14.1
-  - onnxruntime~=1.16.1
-  - optimum~=1.6.4
-  - transformers~=4.26.1
-  - datasets~=2.10.1
-  - scikit-learn~=1.0.2
-url: ''
-version: 0.2.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/0.2.0/static/source.html b/functions/development/hugging_face_classifier_trainer/0.2.0/static/source.html deleted file mode 100644 index 6eee51f5..00000000 --- a/functions/development/hugging_face_classifier_trainer/0.2.0/static/source.html +++ /dev/null @@ -1,854 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import mlrun.datastore
-import mlrun.utils
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRun's context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to be inserted so the MLRun interface will be fully enabled.
-    _PROPERTIES = {
-        "_auto_log": False,
-        "_context": None,
-        "_model_name": "model",
-        "_tag": "",
-        "_labels": None,
-        "_extra_data": None,
-    }
-    _METHODS = ["enable_auto_logging"]
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "optimize",
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRun's features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-        super(HFORTOptimizerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_optimize(cls):
-        """
-        MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self, *args, **kwargs):
-            save_dir = cls._get_function_argument(
-                self.optimize,
-                argument_name="save_dir",
-                passed_args=args,
-                passed_kwargs=kwargs,
-            )[0]
-
-            # Call the original optimize method:
-            result = self.original_optimize(*args, **kwargs)
-
-            if self._auto_log:
-                # Log the onnx model:
-                self._context.log_model(
-                    key="model",
-                    db_key=self._model_name,
-                    model_file=f"{save_dir}/model_optimized.onnx",
-                    tag=self._tag,
-                    framework="ONNX",
-                    labels=self._labels,
-                    extra_data=self._extra_data,
-                )
-
-            return result
-
-        return wrapper
-
-    def enable_auto_logging(
-        self,
-        context: mlrun.MLClientCtx,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        self._auto_log = True
-
-        self._context = context
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data
-
-
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "train",
-        # "evaluate"
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj: Trainer,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRuns features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-
-        super(HFTrainerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_train(cls):
-
-        """
-        MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self: Trainer, *args, **kwargs):
-            # Restore the evaluation method as `train` will use it:
-            # cls._restore_attribute(obj=self, attribute_name="evaluate")
-
-            # Call the original fit method:
-            result = self.original_train(*args, **kwargs)
-
-            # Replace the evaluation method again:
-            # cls._replace_function(obj=self, function_name="evaluate")
-
-            return result
-
-        return wrapper
-
-
-class MLRunCallback(TrainerCallback):
-    """
-    Callback for collecting logs during training / evaluation of the `Trainer` API.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.MLClientCtx = None,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        super().__init__()
-
-        # Store the configurations:
-        self._context = (
-            context
-            if context is not None
-            else mlrun.get_or_create_ctx("./mlrun-huggingface")
-        )
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data if extra_data is not None else {}
-
-        # Set up the logging mode:
-        self._is_training = False
-        self._steps: List[List[int]] = []
-        self._metric_scores: Dict[str, List[float]] = {}
-        self._artifacts: Dict[str, Artifact] = {}
-
-    def on_epoch_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._steps.append([])
-
-    def on_epoch_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float] = None,
-        **kwargs,
-    ):
-        recent_logs = state.log_history[-1].copy()
-
-        recent_logs.pop("epoch")
-        current_step = int(recent_logs.pop("step"))
-        if current_step not in self._steps[-1]:
-            self._steps[-1].append(current_step)
-
-        for metric_name, metric_score in recent_logs.items():
-            if metric_name.startswith("train_"):
-                if metric_name.split("train_")[1] not in self._metric_scores:
-                    self._metric_scores[metric_name] = [metric_score]
-                continue
-            if metric_name not in self._metric_scores:
-                self._metric_scores[metric_name] = []
-            self._metric_scores[metric_name].append(metric_score)
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._is_training = True
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: PreTrainedModel = None,
-        tokenizer: PreTrainedTokenizer = None,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        temp_directory = tempfile.gettempdir()
-
-        # Save and log the tokenizer:
-        if tokenizer is not None:
-            # Save tokenizer:
-            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
-            tokenizer.save_pretrained(save_directory=tokenizer_dir)
-            # Zip the tokenizer directory:
-            tokenizer_zip = shutil.make_archive(
-                base_name="tokenizer",
-                format="zip",
-                root_dir=tokenizer_dir,
-            )
-            # Log the zip file:
-            self._artifacts["tokenizer"] = self._context.log_artifact(
-                item="tokenizer", local_path=tokenizer_zip
-            )
-
-        # Save the model:
-        model_dir = os.path.join(temp_directory, "model")
-        model.save_pretrained(save_directory=model_dir)
-
-        # Zip the model directory:
-        shutil.make_archive(
-            base_name="model",
-            format="zip",
-            root_dir=model_dir,
-        )
-
-        # Log the model:
-        self._context.log_model(
-            key="model",
-            db_key=self._model_name,
-            model_file="model.zip",
-            tag=self._tag,
-            framework="Hugging Face",
-            labels=self._labels,
-            extra_data={**self._artifacts, **self._extra_data},
-        )
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        if self._is_training:
-            return
-
-        # TODO: Update the model object
-
-    def _log_metrics(self):
-        for metric_name, metric_scores in self._metric_scores.items():
-            self._context.log_result(key=metric_name, value=metric_scores[-1])
-            if len(metric_scores) > 1:
-                self._log_metric_plot(name=metric_name, scores=metric_scores)
-        self._context.commit(completed=False)
-
-    def _log_metric_plot(self, name: str, scores: List[float]):
-        # Initialize a plotly figure:
-        metric_figure = go.Figure()
-
-        # Add titles:
-        metric_figure.update_layout(
-            title=name.capitalize().replace("_", " "),
-            xaxis_title="Samples",
-            yaxis_title="Scores",
-        )
-
-        # Draw:
-        metric_figure.add_trace(
-            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
-        )
-
-        # Create the plotly artifact:
-        artifact_name = f"{name}_plot"
-        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
-        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
-
-
-def _apply_mlrun_on_trainer(
-    trainer: transformers.Trainer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
-
-    HFTrainerMLRunInterface.add_interface(obj=trainer)
-
-    if auto_log:
-        trainer.add_callback(
-            MLRunCallback(
-                context=context,
-                model_name=model_name,
-                tag=tag,
-                labels=labels,
-                extra_data=extra_data,
-            )
-        )
-
-
-def _apply_mlrun_on_optimizer(
-    optimizer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(
-            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
-        )
-
-    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)
-
-    if auto_log:
-        optimizer.enable_auto_logging(
-            context=context,
-            model_name=model_name,
-            tag=tag,
-            labels=labels,
-            extra_data=extra_data,
-        )
-
-
-def apply_mlrun(
-    huggingface_object,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    """
-    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
-    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
-    :param model_name:         The model name to use for storing the model artifact. Default: "model".
-    :param tag:                The model's tag to log with.
-    :param context:            MLRun context to work with. If no context is given it will be retrieved via
-                               'mlrun.get_or_create_ctx(None)'
-    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
-    """
-
-    if isinstance(huggingface_object, transformers.Trainer):
-        return _apply_mlrun_on_trainer(
-            trainer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    import optimum.onnxruntime as optimum_ort
-
-    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
-        return _apply_mlrun_on_optimizer(
-            optimizer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    raise mlrun.errors.MLRunInvalidArgumentError
-
-
-# ---------------------- from auto_trainer--------------------------------
-class KWArgsPrefixes:
-    MODEL_CLASS = "CLASS_"
-    FIT = "FIT_"
-    TRAIN = "TRAIN_"
-    PREDICT = "PREDICT_"
-
-
-def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
-    """
-    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
-    keys.
-
-    :param src:         The source dict to extract the values from.
-    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
-                        prefix.
-    """
-    return {
-        key.replace(prefix_key, ""): val
-        for key, val in src.items()
-        if key.startswith(prefix_key)
-    }
-
-
-def _get_dataframe(
-    context: MLClientCtx,
-    dataset: DataItem,
-    label_columns: Optional[Union[str, List[str]]] = None,
-    drop_columns: Union[str, List[str], int, List[int]] = None,
-) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
-    """
-    Getting the DataFrame of the dataset and drop the columns accordingly.
-
-    :param context:         MLRun context.
-    :param dataset:         The dataset to train the model on.
-                            Can be either a list of lists, dict, URI or a FeatureVector.
-    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
-                            Classification tasks.
-    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
-    """
-    if isinstance(dataset, (list, dict)):
-        dataset = pd.DataFrame(dataset)
-        # Checking if drop_columns provided by integer type:
-        if drop_columns:
-            if isinstance(drop_columns, str) or (
-                isinstance(drop_columns, list)
-                and any(isinstance(col, str) for col in drop_columns)
-            ):
-                context.logger.error(
-                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
-                )
-                raise ValueError
-            dataset.drop(drop_columns, axis=1, inplace=True)
-
-        return dataset, label_columns
-
-    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
-    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
-        # feature-vector case:
-        label_columns = label_columns or dataset.meta.status.label_column
-        dataset = fs.get_offline_features(
-            dataset.meta.uri, drop_columns=drop_columns
-        ).to_dataframe()
-
-        context.logger.info(f"label columns: {label_columns}")
-    else:
-        # simple URL case:
-        dataset = dataset.as_df()
-        if drop_columns:
-            if all(col in dataset for col in drop_columns):
-                dataset = dataset.drop(drop_columns, axis=1)
-            else:
-                context.logger.info(
-                    "not all of the columns to drop in the dataset, drop columns process skipped"
-                )
-    return dataset, label_columns
-
-
-# ---------------------- Hugging Face Trainer --------------------------------
-
-
-def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
-    """
-    This function create and returns a function that will be used to compute metrics at evaluation.
-    :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc.
-
-    :returns: Function that will be used to compute metrics at evaluation.
-             Must take a [`EvalPrediction`] and return a dictionary string to metric values.
-    """
-
-    def _compute_metrics(eval_pred):
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        metric_dict_results = {}
-        for metric in metrics:
-            load_met = load_metric(metric)
-            metric_res = load_met.compute(predictions=predictions, references=labels)[
-                metric
-            ]
-            metric_dict_results[metric] = metric_res
-
-        return metric_dict_results
-
-    return _compute_metrics
-
-
-def _edit_columns(
-    dataset: Dataset,
-    drop_columns: List[str] = None,
-    rename_columns: [str, str] = None,
-) -> Dataset:
-    """
-    Drop and renames that columns of the given dataset
-    :param dataset:         Dataset to process
-    :param drop_columns:    The columns to drop from the dataset.
-    :param rename_columns:  Dict of columns ro rename : {: , ...}
-
-    :returns: The dataset after the desired process
-    """
-    if drop_columns:
-        dataset = dataset.remove_columns(drop_columns)
-    if rename_columns:
-        dataset = dataset.rename_columns(rename_columns)
-    return dataset
-
-
-def _prepare_dataset(
-    context: MLClientCtx,
-    dataset_name: str,
-    label_name: str = None,
-    drop_columns: Optional[List[str]] = None,
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    random_state: int = None,
-) -> Tuple[Dataset, Dataset]:
-    """
-    Loading the dataset and editing the columns
-
-    :param context:                 MLRun contex
-    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
-    :param label_name:              The target label of the column in the dataset.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param random_state:            Random state for train_test_split
-
-    """
-
-    context.logger.info(
-        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
-    )
-    rename_cols = {label_name: "labels"}
-
-    # Loading and editing dataset:
-    dataset = load_dataset(dataset_name)
-
-    # train set
-    train_dataset = dataset["train"]
-    if num_of_train_samples:
-        train_dataset = train_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_train_samples))
-        )
-    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)
-
-    # test set
-    test_dataset = dataset["test"]
-    if train_test_split_size or num_of_train_samples:
-        train_test_split_size = train_test_split_size or 0.2
-        num_of_test_samples = int(
-            (train_dataset.num_rows * train_test_split_size)
-            // (1 - train_test_split_size)
-        )
-        test_dataset = test_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_test_samples))
-        )
-    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)
-
-    return train_dataset, test_dataset
-
-
-def train(
-    context: MLClientCtx,
-    hf_dataset: str = None,
-    dataset: DataItem = None,
-    test_set: DataItem = None,
-    drop_columns: Optional[List[str]] = None,
-    pretrained_tokenizer: str = None,
-    pretrained_model: str = None,
-    model_class: str = None,
-    model_name: str = "huggingface-model",
-    label_name: str = "labels",
-    text_col: str = "text",
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    metrics: List[str] = None,
-    random_state: int = None,
-):
-    """
-    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
-    The dataset can be either be the name of the dataset that contains in the HuggingFace hub,
-    or a URI or a FeatureVector
-
-    :param context:                 MLRun context
-    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
-    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
-    :param test_set:                The test set to train the model with.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
-    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
-    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
-    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-    :param label_name:              The target label of the column in the dataset.
-    :param text_col:                The input text column un the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param metrics:                 List of different metrics for evaluate the model such as f1, accuracy etc.
-    :param random_state:            Random state for train_test_split
-    """
-
-    if train_test_split_size is None and test_set is None:
-        context.logger.info(
-            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
-        )
-        train_test_split_size = 0.2
-
-    # Creating tokenizer:
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)
-
-    def preprocess_function(examples):
-        return tokenizer(examples[text_col], truncation=True)
-
-    # prepare data for training
-    if hf_dataset:
-        train_dataset, test_dataset = _prepare_dataset(
-            context,
-            hf_dataset,
-            label_name,
-            drop_columns,
-            num_of_train_samples,
-            train_test_split_size,
-            random_state=random_state,
-        )
-    elif dataset:
-        # Get DataFrame by URL or by FeatureVector:
-        train_dataset, label_name = _get_dataframe(
-            context=context,
-            dataset=dataset,
-            label_columns=label_name,
-            drop_columns=drop_columns,
-        )
-        if test_set:
-            test_dataset, _ = _get_dataframe(
-                context=context,
-                dataset=test_set,
-                label_columns=label_name,
-                drop_columns=drop_columns,
-            )
-        else:
-            train_dataset, test_dataset = train_test_split(
-                train_dataset,
-                test_size=train_test_split_size,
-                random_state=random_state,
-            )
-        train_dataset = Dataset.from_pandas(train_dataset)
-        test_dataset = Dataset.from_pandas(test_dataset)
-    else:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            "Training data was not provided. A training dataset is mandatory for training."
-            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
-        )
-
-    # Mapping datasets with the tokenizer:
-    tokenized_train = train_dataset.map(preprocess_function, batched=True)
-    tokenized_test = test_dataset.map(preprocess_function, batched=True)
-
-    # Creating data collator for batching:
-    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-    # Parsing kwargs:
-    train_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
-    )
-    model_class_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
-    )
-
-    # Loading our pretrained model:
-    model_class_kwargs["pretrained_model_name_or_path"] = (
-        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
-    )
-    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
-    if not model_class_kwargs["pretrained_model_name_or_path"]:
-        raise mlrun.errors.MLRunRuntimeError(
-            "Must provide pretrained_model name as "
-            "function argument or in extra params"
-        )
-    model = create_class(model_class).from_pretrained(**model_class_kwargs)
-
-    # Preparing training arguments:
-    training_args = TrainingArguments(
-        **train_kwargs,
-    )
-
-    compute_metrics = _create_compute_metrics(metrics) if metrics else None
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_test,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics,
-    )
-
-    apply_mlrun(trainer, model_name=model_name)
-
-    # Apply training with evaluation:
-    context.logger.info(f"training '{model_name}'")
-    trainer.train()
-
-
-def _get_model_dir(model_uri: str):
-    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
-    model_dir = tempfile.gettempdir()
-    # Unzip the Model:
-    with zipfile.ZipFile(model_file, "r") as zip_file:
-        zip_file.extractall(model_dir)
-
-    return model_dir
-
-
-def optimize(
-    model_path: str,
-    model_name: str = "optimized_model",
-    target_dir: str = "./optimized",
-    optimization_level: int = 1,
-):
-    """
-    Optimizing the transformer model using ONNX optimization.
-
-
-    :param model_path:          The path of the model to optimize.
-    :param model_name:          Name of the optimized model.
-    :param target_dir:          The directory to save the ONNX model.
-    :param optimization_level:  Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)
-    """
-    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
-    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
-    from optimum.onnxruntime.configuration import OptimizationConfig
-
-    model_dir = _get_model_dir(model_uri=model_path)
-    # Creating configuration for optimization step:
-    optimization_config = OptimizationConfig(optimization_level=optimization_level)
-
-    # Converting our pretrained model to an ONNX-Runtime model:
-    ort_model = ORTModelForSequenceClassification.from_pretrained(
-        model_dir, from_transformers=True
-    )
-
-    # Creating an ONNX-Runtime optimizer from ONNX model:
-    optimizer = ORTOptimizer.from_pretrained(ort_model)
-
-    apply_mlrun(optimizer, model_name=model_name)
-    # Optimizing and saving the ONNX model:
-    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/src/function.yaml b/functions/development/hugging_face_classifier_trainer/latest/src/function.yaml deleted file mode 100644 index eb223b2b..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/src/function.yaml +++ /dev/null @@ -1,368 +0,0 @@ -kind: job -metadata: - name: hugging-face-classifier-trainer - tag: '' - hash: e8113e81f04c96fc9a8a94e717dea81ee3e05a18 - project: '' - labels: - author: davids - categories: - - machine-learning - - model-training -spec: - command: '' - args: [] - image: '' - build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' - origin_filename: '' - requirements: - - onnx~=1.14.1 - - onnxruntime~=1.16.1 - - optimum~=1.6.4 - - transformers~=4.26.1 - - datasets~=2.10.1 - - scikit-learn~=1.0.2 - entry_points: - add_interface: - name: add_interface - doc: 'Enrich the object with this interface properties, methods and functions, - so it will have this TensorFlow.Keras - - MLRuns features.' - parameters: - - name: cls - - name: obj - type: Trainer - doc: The object to enrich his interface. - - name: restoration - type: MLRunInterfaceRestorationType - doc: Restoration information tuple as returned from 'remove_interface' in - order to add the interface in a certain state. - default: null - outputs: [] - lineno: 146 - has_varargs: false - has_kwargs: false - mlrun_optimize: - name: mlrun_optimize - doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when - using horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' 
- parameters: - - name: cls - outputs: [] - lineno: 79 - has_varargs: false - has_kwargs: false - wrapper: - name: wrapper - doc: '' - parameters: - - name: self - type: Trainer - outputs: [] - lineno: 173 - has_varargs: true - has_kwargs: true - enable_auto_logging: - name: enable_auto_logging - doc: '' - parameters: - - name: self - - name: context - type: MLClientCtx - - name: model_name - type: str - default: model - - name: tag - type: str - default: '' - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: [] - lineno: 114 - has_varargs: false - has_kwargs: false - mlrun_train: - name: mlrun_train - doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using - horovod. The optimizer must be - - passed in a keyword argument and when using horovod, it must be passed as - an Optimizer instance, not a string. - - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow - the instructions above.' 
- parameters: - - name: cls - outputs: [] - lineno: 164 - has_varargs: false - has_kwargs: false - on_epoch_begin: - name: on_epoch_begin - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 220 - has_varargs: false - has_kwargs: true - on_epoch_end: - name: on_epoch_end - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 229 - has_varargs: false - has_kwargs: true - on_log: - name: on_log - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - - name: logs - type: Dict[str, float] - default: null - outputs: [] - lineno: 238 - has_varargs: false - has_kwargs: true - on_train_begin: - name: on_train_begin - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 262 - has_varargs: false - has_kwargs: true - on_train_end: - name: on_train_end - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - - name: model - type: PreTrainedModel - default: null - - name: tokenizer - type: PreTrainedTokenizer - default: null - outputs: [] - lineno: 271 - has_varargs: false - has_kwargs: true - on_evaluate: - name: on_evaluate - doc: '' - parameters: - - name: self - - name: args - type: TrainingArguments - - name: state - type: TrainerState - - name: control - type: TrainerControl - outputs: [] - lineno: 322 - has_varargs: false - has_kwargs: true - apply_mlrun: - name: apply_mlrun - doc: Wrap the given model with MLRun's interface providing it with mlrun's additional - features. 
- parameters: - - name: huggingface_object - doc: The model to wrap. Can be loaded from the model path given as well. - - name: model_name - type: str - doc: 'The model name to use for storing the model artifact. Default: "model".' - default: null - - name: tag - type: str - doc: The model's tag to log with. - default: '' - - name: context - type: MLClientCtx - doc: MLRun context to work with. If no context is given it will be retrieved - via 'mlrun.get_or_create_ctx(None)' - default: null - - name: auto_log - type: bool - doc: 'Whether to enable MLRun''s auto logging. Default: True.' - default: true - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: [] - lineno: 421 - has_varargs: false - has_kwargs: true - train: - name: train - doc: 'Training and evaluating a pretrained model with a pretrained tokenizer - over a dataset. - - The dataset can be either be the name of the dataset that contains in the - HuggingFace hub, - - or a URI or a FeatureVector' - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - - name: hf_dataset - type: str - doc: The name of the dataset to get from the HuggingFace hub - default: null - - name: dataset - type: DataItem - doc: The dataset to train the model on. Can be either a URI or a FeatureVector - default: null - - name: test_set - type: DataItem - doc: The test set to train the model with. - default: null - - name: drop_columns - type: Optional[List[str]] - doc: The columns to drop from the dataset. - default: null - - name: pretrained_tokenizer - type: str - doc: The name of the pretrained tokenizer from the HuggingFace hub. - default: null - - name: pretrained_model - type: str - doc: The name of the pretrained model from the HuggingFace hub. - default: null - - name: model_class - type: str - doc: The class of the model, e.g. 
`transformers.AutoModelForSequenceClassification` - default: null - - name: model_name - type: str - doc: The model's name to use for storing the model artifact, default to 'model' - default: huggingface-model - - name: label_name - type: str - doc: The target label of the column in the dataset. - default: labels - - name: text_col - type: str - doc: The input text column un the dataset. - default: text - - name: num_of_train_samples - type: int - doc: Max number of training samples, for debugging. - default: null - - name: train_test_split_size - type: float - doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset - to include in the test split. - default: null - - name: metrics - type: List[str] - doc: List of different metrics for evaluate the model such as f1, accuracy - etc. - default: null - - name: random_state - type: int - doc: Random state for train_test_split - default: null - outputs: [] - lineno: 647 - has_varargs: false - has_kwargs: false - preprocess_function: - name: preprocess_function - doc: '' - parameters: - - name: examples - outputs: [] - lineno: 696 - has_varargs: false - has_kwargs: false - optimize: - name: optimize - doc: Optimizing the transformer model using ONNX optimization. - parameters: - - name: model_path - type: str - doc: The path of the model to optimize. - - name: model_name - type: str - doc: Name of the optimized model. - default: optimized_model - - name: target_dir - type: str - doc: The directory to save the ONNX model. - default: ./optimized - - name: optimization_level - type: int - doc: Optimization level performed by ONNX Runtime of the loaded graph. 
(default - is 1) - default: 1 - outputs: [] - lineno: 799 - has_varargs: false - has_kwargs: false - description: Automatic train and optimize functions for HuggingFace framework - default_handler: train - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/hugging_face_classifier_trainer/latest/src/hugging_face_classifier_trainer.ipynb b/functions/development/hugging_face_classifier_trainer/latest/src/hugging_face_classifier_trainer.ipynb deleted file mode 100644 index 2768d2dc..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/src/hugging_face_classifier_trainer.ipynb +++ /dev/null @@ -1,2533 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "\n", - "# MLRun Hugging Face Classifier Trainer Tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "This notebook shows how to use the handlers of the Hugging Face classifier trainer.\n", - "the following handlers are:\n", - "- `train`\n", - "- `optimize`\n", - "\n", - "All you need is simply **HF model type** and a **HF dataset name**." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: onnx~=1.14.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (1.14.1)\n", - "Requirement already satisfied: onnxruntime==1.16.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.16.1)\n", - "Requirement already satisfied: optimum~=1.6.4 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.6.4)\n", - "Requirement already satisfied: transformers~=4.26.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (4.26.1)\n", - "Requirement already satisfied: datasets~=2.10.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (2.10.1)\n", - "Requirement already satisfied: scikit-learn~=1.0.2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.0.2)\n", - "Requirement already satisfied: coloredlogs in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (15.0.1)\n", - "Requirement already satisfied: flatbuffers in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)\n", - "Requirement already satisfied: numpy>=1.21.6 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.23.5)\n", - "Requirement already satisfied: packaging in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (21.3)\n", - "Requirement already satisfied: protobuf in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt 
(line 2)) (3.20.2)\n", - "Requirement already satisfied: sympy in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)\n", - "Requirement already satisfied: typing-extensions>=3.6.2.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnx~=1.14.1->-r requirements.txt (line 1)) (4.7.1)\n", - "Requirement already satisfied: torch>=1.9 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.2)\n", - "Requirement already satisfied: huggingface-hub>=0.8.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (0.20.1)\n", - "Requirement already satisfied: filelock in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (3.13.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (5.4.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2023.12.25)\n", - "Requirement already satisfied: requests in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2.31.0)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (0.13.3)\n", - "Requirement already satisfied: tqdm>=4.27 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (4.65.0)\n", - "Requirement already satisfied: pyarrow>=6.0.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (11.0.0)\n", - "Requirement already satisfied: 
dill<0.3.7,>=0.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.3.6)\n", - "Requirement already satisfied: pandas in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.4)\n", - "Requirement already satisfied: xxhash in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.3.0)\n", - "Requirement already satisfied: multiprocess in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.70.14)\n", - "Requirement already satisfied: fsspec>=2021.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from fsspec[http]>=2021.11.1->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.9.2)\n", - "Requirement already satisfied: aiohttp in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.9.1)\n", - "Requirement already satisfied: responses<0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.18.0)\n", - "Requirement already satisfied: scipy>=1.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.11.4)\n", - "Requirement already satisfied: joblib>=0.11 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (3.2.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (19.1.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in 
/conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.9.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.3.1)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (4.0.3)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from packaging->onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.1.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2.1.1)\n", - "Requirement already satisfied: idna<4,>=2.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (1.26.16)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2023.7.22)\n", - "Requirement already satisfied: networkx in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.2.1)\n", - "Requirement 
already satisfied: jinja2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.1.3)\n", - "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (8.9.2.26)\n", - "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.3.1)\n", - "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.0.2.54)\n", - "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (10.3.2.106)\n", - "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.4.5.107)\n", - "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) 
(12.1.0.106)\n", - "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.18.1)\n", - "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)\n", - "Requirement already satisfied: triton==2.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.0)\n", - "Requirement already satisfied: nvidia-nvjitlink-cu12 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.3.101)\n", - "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers[sentencepiece]>=4.26.0->optimum~=1.6.4->-r requirements.txt (line 3)) (0.2.0)\n", - "Requirement already satisfied: humanfriendly>=9.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from coloredlogs->onnxruntime==1.16.1->-r requirements.txt (line 2)) (9.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.3.post1)\n", - "Requirement already satisfied: mpmath>=0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from sympy->onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.3.0)\n", - "Requirement already satisfied: six>=1.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (1.16.0)\n", - "Requirement 
already satisfied: MarkupSafe>=2.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from jinja2->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.3)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:10:17,091 [info] Project loaded successfully: {'project_name': 'hugging-face-trainer'}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project('hugging-face-trainer', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Importing the hugging_face_classifier_trainer function from the Marketplace**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "hugging_face_classifier_trainer = mlrun.import_function(\"hub://hugging_face_classifier_trainer\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### **Training a model**\n", - "\n", - "Choosing the `train` handler" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Define task parameters¶\n", - "* Class parameters should contain the prefix `CLASS_`\n", - "* Train parameters should contain the prefix `TRAIN_`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_class = 
\"transformers.AutoModelForSequenceClassification\"\n", - "additional_parameters = {\n", - " \"TRAIN_output_dir\": \"finetuning-sentiment-model-3000-samples\",\n", - " \"TRAIN_learning_rate\": 2e-5,\n", - " \"TRAIN_per_device_train_batch_size\": 16,\n", - " \"TRAIN_per_device_eval_batch_size\": 16,\n", - " \"TRAIN_num_train_epochs\": 3,\n", - " \"TRAIN_weight_decay\": 0.01,\n", - " \"TRAIN_push_to_hub\": False,\n", - " \"TRAIN_evaluation_strategy\": \"epoch\",\n", - " \"TRAIN_eval_steps\": 1,\n", - " \"TRAIN_logging_steps\": 1,\n", - " \"CLASS_num_labels\": 2\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Running the Training job with the \"train\" handler" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:10:21,025 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '514d8d5530c842238b1cc81983cd943e', 'db': 'http://mlrun-api:8080'}\n", - "> 2024-03-24 17:11:03,727 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2\n", - "> 2024-03-24 17:11:03,882 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f43b1388d0b344888323bec590baadee", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/3 [00:00 2024-03-24 17:11:08,938 [info] training 'huggingface-model'\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "***** Running training *****\n", - " Num examples = 100\n", - " Num Epochs = 3\n", - " Instantaneous batch size per device = 16\n", - " Total train batch size (w. parallel, distributed & accumulation) = 16\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 21\n", - " Number of trainable parameters = 66955010\n", - "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [21/21 00:15, Epoch 3/3]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EpochTraining LossValidation LossAccuracyF1
10.7389000.5153110.7916670.000000
20.5259000.4815630.7916670.000000
30.4908000.4716750.7916670.000000

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "***** Running Evaluation *****\n", - " Num examples = 24\n", - " Batch size = 16\n", - "/tmp/tmp0c1aawrq.py:561: FutureWarning:\n", - "\n", - "load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", - "\n", - "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "***** Running Evaluation *****\n", - " Num examples = 24\n", - " Batch size = 16\n", - "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "***** Running Evaluation *****\n", - " Num examples = 24\n", - " Batch size = 16\n", - "\n", - "\n", - "Training completed. Do not forget to share your model on huggingface.co/models =)\n", - "\n", - "\n", - "tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json\n", - "Special tokens file saved in /tmp/tokenizer/special_tokens_map.json\n", - "Configuration saved in /tmp/model/config.json\n", - "Model weights saved in /tmp/model/pytorch_model.bin\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:10:21completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.4908
learning_rate=0.0
eval_loss=0.47167453169822693
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=0.5186
eval_samples_per_second=46.276
eval_steps_per_second=3.856
train_runtime=17.6054
train_samples_per_second=17.04
train_steps_per_second=1.193
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:12:01,880 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" - ] - } - ], - "source": [ - "train_run = hugging_face_classifier_trainer.run(params={\n", - " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", - " \"drop_columns\": [\n", - " \"airline_sentiment_confidence\",\n", - " \"negativereason_confidence\",\n", - " ],\n", - " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", - " \"pretrained_model\": \"distilbert-base-uncased\",\n", - " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", - " \"label_name\": \"airline_sentiment\",\n", - " \"num_of_train_samples\": 100,\n", - " \"metrics\": [\"accuracy\", \"f1\"],\n", - " \"random_state\": 42,\n", - " **additional_parameters\n", - " },\n", - " handler=\"train\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### The result of the train run" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'loss': 0.4908,\n", - " 'learning_rate': 0.0,\n", - " 'eval_loss': 0.47167453169822693,\n", - " 'eval_accuracy': 0.7916666666666666,\n", - " 'eval_f1': 0.0,\n", - " 'eval_runtime': 0.5186,\n", - " 'eval_samples_per_second': 46.276,\n", - " 'eval_steps_per_second': 3.856,\n", - " 'train_runtime': 17.6054,\n", - " 'train_samples_per_second': 17.04,\n", - " 
'train_steps_per_second': 1.193,\n", - " 'total_flos': 3327208489680.0,\n", - " 'loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',\n", - " 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',\n", - " 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',\n", - " 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',\n", - " 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',\n", - " 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',\n", - " 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',\n", - " 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',\n", - " 'tokenizer': 'store://artifacts/hugging-face-trainer-avia/hugging-face-classifier-trainer-train_tokenizer@514d8d5530c842238b1cc81983cd943e',\n", - " 'model': 'store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_run.outputs" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "train_run.artifact('loss_plot').show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### Getting the model for evaluating and predicting" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "model_path = train_run.outputs['model']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Optimize the model**\n", - "\n", - "Choosing the `optimize` handler\n", - "\n", - "The result of using this handled is an onnx optimized model." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:12:02,020 [info] Storing function: {'name': 'hugging-face-classifier-trainer-optimize', 'uid': 'fbee1ead18444824a4b5c0308a677bf4', 'db': 'http://mlrun-api:8080'}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/optimum/onnxruntime/configuration.py:726: FutureWarning:\n", - "\n", - "disable_embed_layer_norm will be deprecated soon, use disable_embed_layer_norm_fusion instead, disable_embed_layer_norm_fusion is set to True.\n", - "\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/config.json\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " 
\"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading weights file /tmp/pytorch_model.bin\n", - "All model checkpoint weights were used when initializing DistilBertForSequenceClassification.\n", - "\n", - "All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at /tmp.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.\n", - "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py:218: TracerWarning:\n", - "\n", - "torch.tensor results are registered as constants in the trace. 
You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - "\n", - "Configuration saved in /tmp/tmp79wjp8m8/config.json\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " 
\"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 
0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " 
\"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Configuration saved in optimized/config.json\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": 
true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", - "loading configuration file /tmp/tmp79wjp8m8/config.json\n", - "Model config DistilBertConfig {\n", - " \"_name_or_path\": \"/tmp/tmp79wjp8m8\",\n", - " \"activation\": \"gelu\",\n", - " \"architectures\": [\n", - " \"DistilBertForSequenceClassification\"\n", - " ],\n", - " \"attention_dropout\": 0.1,\n", - " \"dim\": 768,\n", - " \"dropout\": 0.1,\n", - " \"hidden_dim\": 3072,\n", - " \"initializer_range\": 0.02,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"distilbert\",\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 6,\n", - " \"pad_token_id\": 0,\n", - " \"problem_type\": \"single_label_classification\",\n", - " \"qa_dropout\": 0.1,\n", - " \"seq_classif_dropout\": 0.2,\n", - " \"sinusoidal_pos_embds\": false,\n", - " \"tie_weights_\": true,\n", - " \"torch_dtype\": \"float32\",\n", - " \"transformers_version\": \"4.26.1\",\n", - " \"vocab_size\": 30522\n", - "}\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.0/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.0/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.0/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.0/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.1/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.1/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.1/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.1/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.2/attention/Transpose_output_0\"\n", - "input: 
\"/distilbert/transformer/layer.2/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.2/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.2/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.3/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.3/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.3/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.3/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.4/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.4/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.4/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.4/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Failed to remove node input: \"/distilbert/transformer/layer.5/attention/Transpose_output_0\"\n", - "input: \"/distilbert/transformer/layer.5/attention/Constant_11_output_0\"\n", - "output: \"/distilbert/transformer/layer.5/attention/Div_output_0\"\n", - "name: \"/distilbert/transformer/layer.5/attention/Div\"\n", - "op_type: \"Div\"\n", - "\n", - "Configuration saved in optimized/config.json\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:12:02completedhugging-face-classifier-trainer-optimize
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
model_path=store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:12:22,721 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-optimize'}\n" - ] - } - ], - "source": [ - "optimize_run = hugging_face_classifier_trainer.run(params={\n", - " \"model_path\": str(model_path)\n", - " },\n", - " handler=\"optimize\",\n", - " local=True,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'model': 'store://artifacts/hugging-face-trainer-avia/optimized_model@fbee1ead18444824a4b5c0308a677bf4'}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "optimize_run.outputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the training remotely**\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlrun/projects/operations.py:276: OverwriteBuildParamsWarning:\n", - "\n", - "The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:14:22,792 [info] Started building image: .mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest\n", - "\u001b[36mINFO\u001b[0m[0000] Retrieving image manifest 
mlrun/mlrun:1.6.1 \n", - "\u001b[36mINFO\u001b[0m[0000] Retrieving image mlrun/mlrun:1.6.1 from registry index.docker.io \n", - "\u001b[36mINFO\u001b[0m[0000] Built cross stage deps: map[] \n", - "\u001b[36mINFO\u001b[0m[0000] Retrieving image manifest mlrun/mlrun:1.6.1 \n", - "\u001b[36mINFO\u001b[0m[0000] Returning cached image manifest \n", - "\u001b[36mINFO\u001b[0m[0000] Executing 0 build triggers \n", - "\u001b[36mINFO\u001b[0m[0000] Building stage 'mlrun/mlrun:1.6.1' [idx: '0', base-idx: '-1'] \n", - "\u001b[36mINFO\u001b[0m[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. \n", - "\u001b[36mINFO\u001b[0m[0047] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt \n", - "\u001b[36mINFO\u001b[0m[0047] Initializing snapshotter ... \n", - "\u001b[36mINFO\u001b[0m[0047] Taking snapshot of full filesystem... \n", - "\u001b[36mINFO\u001b[0m[0074] Cmd: /bin/sh \n", - "\u001b[36mINFO\u001b[0m[0074] Args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] \n", - "\u001b[36mINFO\u001b[0m[0074] Running: [/bin/sh -c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] \n", - "Installing /empty/requirements.txt...\n", - "mlrun[complete]==1.6.1\n", - "onnx~=1.14.1\n", - "onnxruntime~=1.16.1\n", - "optimum~=1.6.4\n", - "transformers~=4.26.1\n", - "datasets~=2.10.1\n", - "scikit-learn~=1.0.2\n", - "\u001b[36mINFO\u001b[0m[0074] Taking snapshot of full filesystem... \n", - "\u001b[36mINFO\u001b[0m[0078] No files were changed, appending empty layer to config. No layer added to image. 
\n", - "\u001b[36mINFO\u001b[0m[0078] RUN python -m pip install -r /empty/requirements.txt \n", - "\u001b[36mINFO\u001b[0m[0078] Cmd: /bin/sh \n", - "\u001b[36mINFO\u001b[0m[0078] Args: [-c python -m pip install -r /empty/requirements.txt] \n", - "\u001b[36mINFO\u001b[0m[0078] Running: [/bin/sh -c python -m pip install -r /empty/requirements.txt] \n", - "Requirement already satisfied: mlrun[complete]==1.6.1 in /opt/conda/lib/python3.9/site-packages (from -r /empty/requirements.txt (line 1)) (1.6.1)\n", - "Collecting onnx~=1.14.1 (from -r /empty/requirements.txt (line 2))\n", - " Obtaining dependency information for onnx~=1.14.1 from https://files.pythonhosted.org/packages/ff/24/0e522fdcadf0e15fc304145a5b6e5d7246d7f2c507fd9bfe6e1fafb2aa95/onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n", - "Collecting onnxruntime~=1.16.1 (from -r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for onnxruntime~=1.16.1 from https://files.pythonhosted.org/packages/de/ab/ed3ae0d649cee41e870f8b1653cf4a1c1fc321e0ded4e3e1a3d4a25c0131/onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)\n", - "Collecting optimum~=1.6.4 (from -r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for optimum~=1.6.4 from https://files.pythonhosted.org/packages/31/72/a7e3b2c57d6368c5f4bb6fba54a85cbf07d25c385a2db3f1a638f3c0ddb2/optimum-1.6.4-py3-none-any.whl.metadata\n", - " Downloading optimum-1.6.4-py3-none-any.whl.metadata (17 kB)\n", - "Collecting transformers~=4.26.1 (from -r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for transformers~=4.26.1 from 
https://files.pythonhosted.org/packages/1e/e2/60c3f4691b16d126ee9cfe28f598b13c424b60350ab339aba81aef054b8f/transformers-4.26.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.26.1-py3-none-any.whl.metadata (100 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.3/100.3 kB 6.2 MB/s eta 0:00:00\n", - "Collecting datasets~=2.10.1 (from -r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for datasets~=2.10.1 from https://files.pythonhosted.org/packages/fe/17/5825fdf034ff1a315becdbb9b6fe5a2bd9d8e724464535f18809593bf9c2/datasets-2.10.1-py3-none-any.whl.metadata\n", - " Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)\n", - "Collecting scikit-learn~=1.0.2 (from -r /empty/requirements.txt (line 7))\n", - " Obtaining dependency information for scikit-learn~=1.0.2 from https://files.pythonhosted.org/packages/57/aa/483fbe6b5314bce2d49801e6cec1f2139a9c220d0d51494788fff47233b3/scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n", - "Requirement already satisfied: urllib3<1.27,>=1.26.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.18)\n", - "Requirement already satisfied: GitPython>=3.1.41,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.42)\n", - "Requirement already satisfied: aiohttp~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.3)\n", - "Requirement already satisfied: aiohttp-retry~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.3)\n", - "Requirement already satisfied: click~=8.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) 
(8.1.7)\n", - "Requirement already satisfied: kfp~=1.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.22)\n", - "Requirement already satisfied: nest-asyncio~=1.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.0)\n", - "Requirement already satisfied: ipython~=8.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.18.1)\n", - "Requirement already satisfied: nuclio-jupyter~=0.9.15 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.16)\n", - "Requirement already satisfied: numpy<1.27.0,>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.4)\n", - "Requirement already satisfied: pandas<2.2,>=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.4)\n", - "Requirement already satisfied: pyarrow<15,>=10.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (14.0.2)\n", - "Requirement already satisfied: pyyaml~=5.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.4.1)\n", - "Requirement already satisfied: requests~=2.31 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.31.0)\n", - "Requirement already satisfied: tabulate~=0.8.6 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.10)\n", - "Requirement already satisfied: v3io~=0.5.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.23)\n", - "Requirement already satisfied: pydantic>=1.10.8,~=1.10 in 
/opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.10.14)\n", - "Requirement already satisfied: mergedeep~=1.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.4)\n", - "Requirement already satisfied: v3io-frames~=0.10.12 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.13)\n", - "Requirement already satisfied: semver~=3.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: dependency-injector~=4.41 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.41.0)\n", - "Requirement already satisfied: fsspec==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", - "Requirement already satisfied: v3iofs~=0.1.17 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.18)\n", - "Requirement already satisfied: storey~=1.6.18 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.18)\n", - "Requirement already satisfied: inflection~=0.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", - "Requirement already satisfied: python-dotenv~=0.17.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.17.1)\n", - "Requirement already satisfied: setuptools~=68.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (68.2.2)\n", - "Requirement already satisfied: deprecated~=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1)) (1.2.14)\n", - "Requirement already satisfied: jinja2>=3.1.3,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.3)\n", - "Requirement already satisfied: anyio~=3.7 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.7.1)\n", - "Requirement already satisfied: orjson~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.15)\n", - "Requirement already satisfied: adlfs==2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.0)\n", - "Requirement already satisfied: aiobotocore<2.8,>=2.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5.4)\n", - "Requirement already satisfied: avro~=1.11 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.11.3)\n", - "Requirement already satisfied: azure-core~=1.24 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.30.0)\n", - "Requirement already satisfied: azure-identity~=1.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.15.0)\n", - "Requirement already satisfied: azure-keyvault-secrets~=4.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.8.0)\n", - "Requirement already satisfied: boto3<1.29.0,>=1.28.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.28.17)\n", - "Requirement already satisfied: dask~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)\n", - "Requirement already satisfied: 
databricks-sdk~=0.13.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.13.0)\n", - "Requirement already satisfied: distributed~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)\n", - "Requirement already satisfied: gcsfs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", - "Requirement already satisfied: google-cloud-bigquery[bqstorage,pandas]==3.14.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.14.1)\n", - "Requirement already satisfied: graphviz~=0.20.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.1)\n", - "Requirement already satisfied: kafka-python~=2.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.2)\n", - "Requirement already satisfied: mlflow~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.10.2)\n", - "Requirement already satisfied: msrest~=0.6.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.21)\n", - "Requirement already satisfied: plotly<5.12.0,~=5.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.11.0)\n", - "Requirement already satisfied: pyopenssl>=23 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.0)\n", - "Requirement already satisfied: redis~=4.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.6.0)\n", - "Requirement already satisfied: s3fs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from 
mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)\n", - "Requirement already satisfied: sqlalchemy~=1.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.51)\n", - "Requirement already satisfied: azure-datalake-store<0.1,>=0.0.46 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.0.53)\n", - "Requirement already satisfied: azure-storage-blob>=12.12.0 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (12.19.0)\n", - "Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.1.1)\n", - "Requirement already satisfied: google-auth>=1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.28.1)\n", - "Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: google-cloud-storage in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.14.0)\n", - "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.1)\n", - "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)\n", - "Requirement already satisfied: 
google-resumable-media<3.0dev,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.7.0)\n", - "Requirement already satisfied: packaging>=20.0.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1)\n", - "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)\n", - "Requirement already satisfied: db-dtypes<2.0.0dev,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.24.0)\n", - "Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)\n", - "Requirement already satisfied: protobuf>=3.20.2 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (3.20.3)\n", - "Requirement already satisfied: typing-extensions>=3.6.2.1 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (4.10.0)\n", - "Collecting coloredlogs (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for coloredlogs from 
https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata\n", - " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n", - "Collecting flatbuffers (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/bf/45/c961e3cb6ddad76b325c163d730562bb6deb1ace5acbed0306f5fbefb90e/flatbuffers-24.3.7-py2.py3-none-any.whl.metadata\n", - " Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)\n", - "Collecting sympy (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/d2/05/e6600db80270777c4a64238a98d442f0fd07cc8915be2a1c16da7f2b9e74/sympy-1.12-py3-none-any.whl.metadata\n", - " Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)\n", - "Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0a/fd/280f4385e76f3c1890efc15fa93f7206134fefad6351397e1bfab6d0d0de/transformers-4.39.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 40.1 MB/s eta 0:00:00\n", - "Collecting torch>=1.9 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for torch>=1.9 from https://files.pythonhosted.org/packages/98/04/95a12556d068786d6505c609daf2805bed91c9210c5185499a7c121eba47/torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata\n", - " Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata (25 kB)\n", - "Collecting numpy<1.27.0,>=1.16.5 (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1))\n", - " 
Obtaining dependency information for numpy<1.27.0,>=1.16.5 from https://files.pythonhosted.org/packages/4c/b9/038abd6fbd67b05b03cb1af590cfc02b7f1e5a37af7ac6a868f5093c29f5/numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)\n", - "Collecting huggingface-hub>=0.8.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for huggingface-hub>=0.8.0 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata\n", - " Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)\n", - "Collecting filelock (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata\n", - " Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)\n", - "Collecting regex!=2019.12.17 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/05/9e/80c20f1151432a6025690c9c2037053039b028a7b236fa81d7e7ac9dec60/regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 217.5 MB/s eta 0:00:00\n", - "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for tokenizers!=0.11.3,<0.14,>=0.11.1 from 
https://files.pythonhosted.org/packages/d6/27/07a337087dd507170a1b20fed3bbf8da81401185a7130a6e74e440c52040/tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", - "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.9/site-packages (from transformers~=4.26.1->-r /empty/requirements.txt (line 5)) (4.65.0)\n", - "Collecting dill<0.3.7,>=0.3.0 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for dill<0.3.7,>=0.3.0 from https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl.metadata\n", - " Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)\n", - "Requirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets~=2.10.1->-r /empty/requirements.txt (line 6)) (3.4.1)\n", - "Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl.metadata\n", - " Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)\n", - "Collecting responses<0.19 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for responses<0.19 from https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl.metadata\n", - " Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)\n", - "Requirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.12.0)\n", - "Requirement already satisfied: joblib>=0.11 in 
/opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (3.3.0)\n", - "Requirement already satisfied: botocore<1.31.18,>=1.31.17 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.31.17)\n", - "Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", - "Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.11.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.9.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r 
/empty/requirements.txt (line 1)) (4.0.3)\n", - "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.4)\n", - "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", - "Requirement already satisfied: exceptiongroup in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: six>=1.11.0 in /opt/conda/lib/python3.9/site-packages (from azure-core~=1.24->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", - "Requirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (42.0.2)\n", - "Requirement already satisfied: msal<2.0.0,>=1.24.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.27.0)\n", - "Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.1.0)\n", - "Requirement already satisfied: isodate>=0.6.1 in /opt/conda/lib/python3.9/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.1)\n", - "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.1)\n", - "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.2)\n", - "Requirement already 
satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.2.1)\n", - "Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)\n", - "Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.0)\n", - "Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.1)\n", - "Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", - "Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.7)\n", - "Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.8)\n", - "Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)\n", - "Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)\n", - "Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.4)\n", - "Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from 
distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)\n", - "Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.9/site-packages (from GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.11)\n", - "Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.19.1)\n", - "Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.6)\n", - "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.43)\n", - "Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.2)\n", - "Requirement already satisfied: stack-data in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.3)\n", - "Requirement already satisfied: traitlets>=5 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.14.1)\n", - "Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from jinja2>=3.1.3,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.5)\n", - "Requirement already satisfied: absl-py<2,>=0.9 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.0)\n", - "Requirement already 
satisfied: kubernetes<26,>=8.0.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (25.3.0)\n", - "Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.12.11)\n", - "Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.1)\n", - "Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.5)\n", - "Requirement already satisfied: jsonschema<5,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.21.1)\n", - "Requirement already satisfied: strip-hints<1,>=0.1.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.10)\n", - "Requirement already satisfied: docstring-parser<1,>=0.7.3 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.15)\n", - "Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.16 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.16)\n", - "Requirement already satisfied: fire<1,>=0.3.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.0)\n", - "Requirement already satisfied: uritemplate<4,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)\n", - "Requirement already satisfied: typer<1.0,>=0.3.2 in /opt/conda/lib/python3.9/site-packages (from 
kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", - "Requirement already satisfied: entrypoints<1 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4)\n", - "Requirement already satisfied: pytz<2024 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.4)\n", - "Requirement already satisfied: sqlparse<1,>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4.4)\n", - "Requirement already satisfied: alembic!=1.10.0,<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13.1)\n", - "Requirement already satisfied: docker<8,>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.0)\n", - "Requirement already satisfied: Flask<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: querystring-parser<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.4)\n", - "Requirement already satisfied: markdown<4,>=3.3 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.5.2)\n", - "Requirement already satisfied: matplotlib<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.8.3)\n", - "Requirement already satisfied: gunicorn<22 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)\n", - "Requirement already satisfied: requests-oauthlib>=0.5.0 in 
/opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.2.2)\n", - "Requirement already satisfied: nbconvert>=6.4.5 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.16.1)\n", - "Requirement already satisfied: notebook<7.0.0,>=6.4 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.5.6)\n", - "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.9/site-packages (from pandas<2.2,>=1.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.1)\n", - "Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly<5.12.0,~=5.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.2.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.9/site-packages (from requests~=2.31->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.4)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.9/site-packages (from sqlalchemy~=1.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.3)\n", - "Requirement already satisfied: nuclio-sdk>=0.5.3 in /opt/conda/lib/python3.9/site-packages (from storey~=1.6.18->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.9)\n", - "Collecting networkx (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for networkx from https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl.metadata\n", - " Downloading 
networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)\n", - "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cuda-nvrtc-cu12==12.1.105 from https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cuda-runtime-cu12==12.1.105 from https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cuda-cupti-cu12==12.1.105 from https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cudnn-cu12==8.9.2.26 from https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting 
nvidia-cublas-cu12==12.1.3.1 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cublas-cu12==12.1.3.1 from https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cufft-cu12==11.0.2.54 from https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-curand-cu12==10.3.2.106 from https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", - "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-cusolver-cu12==11.4.5.107 from https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 
4))\n", - " Obtaining dependency information for nvidia-cusparse-cu12==12.1.0.106 from https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", - "Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-nccl-cu12==2.19.3 from https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)\n", - "Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-nvtx-cu12==12.1.105 from https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata\n", - " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n", - "Collecting triton==2.2.0 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for triton==2.2.0 from https://files.pythonhosted.org/packages/6a/5c/01d9f062f719581cf6e60053e1a005d666ec67dcb59630fffaa3a3e5c9d8/triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)\n", - "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for nvidia-nvjitlink-cu12 from 
https://files.pythonhosted.org/packages/58/d1/d1c80553f9d5d07b6072bc132607d75a0ef3600e28e1890e11c0f55d7346/nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata\n", - " Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", - "INFO: pip is looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.\n", - "Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/a4/73/f620d76193954e16db3d5c53a07d956d7b9c800e570758d3bff91906d4a4/transformers-4.39.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 115.9 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 126.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3e/6b/1b589f7b69aaea8193cf5bc91cf97410284aecd97b6312cdb08baedbdffe/transformers-4.38.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 138.2 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/91/89/5416dc364c7ef0711c564fd61a69b03d1e40eeb5c506c38e53ba8a969e79/transformers-4.38.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 186.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/85/f6/c5065913119c41ecad148c34e3a861f719e16b89a522287213698da911fc/transformers-4.37.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 236.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ad/67/b4d6a51dcaf988cb45b31e26c6e33fb169fe34ba5fb168b086309bd7c028/transformers-4.37.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 156.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3c/45/52133ce6bce49a099cc865599803bf1fad93de887276f728e56848d77a70/transformers-4.37.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 102.0 MB/s eta 0:00:00\n", - "INFO: pip is still looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. 
This could take a while.\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 108.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/fc/04/0aad491cd98b09236c54ab849863ee85421eeda5138bbf9d33ecc594652b/transformers-4.36.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 140.6 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0f/12/d8e27a190ca67811f81deea3183b528d9169f10b74d827e0b9211520ecfa/transformers-4.36.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 267.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 130.2 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/92/ba/cfff7e01f7070d9fca3964bf42b2257b86964c3e6763b8d5435436cc1d77/transformers-4.35.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.35.1-py3-none-any.whl.metadata (123 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 183.6 
MB/s eta 0:00:00\n", - "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 177.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 270.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 133.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 163.1 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.9 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/13/30/54b59e73400df3de506ad8630284e9fd63f4b94f735423d55fc342181037/transformers-4.33.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.2 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 185.9 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.32.1-py3-none-any.whl.metadata (118 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 144.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 150.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.9/116.9 kB 156.7 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/5b/0b/e45d26ccd28568013523e04f325432ea88a442b4e3020b757cf4361f0120/transformers-4.30.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.7 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b8/df/b01b5e67cde3883757c9212455cbb9169385dcab5858b7172199126b756d/transformers-4.30.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.30.1-py3-none-any.whl.metadata (113 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.8 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e2/72/1af3d38e98fdcceb3876de4567ac395a66c26976e259fe2d46266e052d61/transformers-4.30.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 266.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/17/aa/a89864288afe45abe1ab79f002140a20348140e86836d96096d8f8a3bac0/transformers-4.29.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.29.2-py3-none-any.whl.metadata (112 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 272.7 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.29.1-py3-none-any.whl.metadata (112 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 262.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/45/e4/4914b11df70954d95a7c36b74bf9010c8594fcec960471479449b0deb4f7/transformers-4.29.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.29.0-py3-none-any.whl.metadata (111 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.9/111.9 kB 269.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 245.6 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/8b/13/1ce598763b3669d43f192a7911bf2bf730a328012ab8801b93187a4f70d0/transformers-4.28.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 256.3 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/87/f0/2a152ed10ab8601431e87a606d397f7473c5fa4f8162f4ec5bda6ddb2df4/transformers-4.27.4-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 254.4 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from 
https://files.pythonhosted.org/packages/52/ac/9dc5a17ba60bc354d99250d9d1629f99d76f6729cee438fa91c8cc74bc5d/transformers-4.27.3-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.3-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 251.5 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/73/f0/4a795505387a3e7cd7f0c2a2a87f876658f9a07947a38fb67bffceff9246/transformers-4.27.2-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 246.1 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/6d/9b/2f536f9e73390209e0b27b74691355dac494b7ec8154f3012fdc6debbae7/transformers-4.27.1-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.1-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 114.0 MB/s eta 0:00:00\n", - " Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/4d/3e/1378ed266cf991f5ab5fcb29e953d97d793c7f9242ea5dc52f856415ea3a/transformers-4.27.0-py3-none-any.whl.metadata\n", - " Downloading transformers-4.27.0-py3-none-any.whl.metadata (106 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 247.2 MB/s eta 0:00:00\n", - "Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))\n", - " Obtaining dependency information for sentencepiece!=0.1.92,>=0.1.91 from https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata 
(7.7 kB)\n", - "Collecting protobuf>=3.20.2 (from onnx~=1.14.1->-r /empty/requirements.txt (line 2))\n", - " Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/38/b1/d9b615dceb67ac38e13cbd7680c27182b40154996022cbb244ba1ac7d30f/protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata\n", - " Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)\n", - "Requirement already satisfied: future>=0.18.2 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", - "Requirement already satisfied: ujson>=3 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.0)\n", - "Requirement already satisfied: googleapis-common-protos>=1.5.3 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)\n", - "Requirement already satisfied: grpcio-tools!=1.34.0,<1.49,>=1.30 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)\n", - "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata\n", - " Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n", - "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. 
This could take a while.\n", - "Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))\n", - " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/c6/c9/820b5ab056f4ada76fbe05bd481a948f287957d6cbfd59e2dd2618b408c1/multiprocess-0.70.15-py39-none-any.whl.metadata\n", - " Downloading multiprocess-0.70.15-py39-none-any.whl.metadata (7.2 kB)\n", - " Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/6a/f4/fbeb03ef7abdda54db4a6a75c971b88ab73d724ff09e3275cc1e99f1c946/multiprocess-0.70.14-py39-none-any.whl.metadata\n", - " Downloading multiprocess-0.70.14-py39-none-any.whl.metadata (6.6 kB)\n", - "Collecting mpmath>=0.19 (from sympy->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))\n", - " Obtaining dependency information for mpmath>=0.19 from https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl.metadata\n", - " Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n", - "Requirement already satisfied: Mako in /opt/conda/lib/python3.9/site-packages (from alembic!=1.10.0,<2->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.2)\n", - "Requirement already satisfied: cffi in /opt/conda/lib/python3.9/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)\n", - "Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire<1,>=0.3.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)\n", - "Requirement already satisfied: Werkzeug>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)\n", - "Requirement already satisfied: itsdangerous>=2.1.2 in /opt/conda/lib/python3.9/site-packages (from 
Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.2)\n", - "Requirement already satisfied: blinker>=1.6.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.0.1)\n", - "Requirement already satisfied: httplib2<1dev,>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.22.0)\n", - "Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.3.3)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9)\n", - "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery-storage<3.0.0dev,>=2.6.0->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.23.0)\n", - "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /opt/conda/lib/python3.9/site-packages (from 
google-cloud-storage->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.0)\n", - "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata>=4.13.0->dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.17.0)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from jedi>=0.16->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.3)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.12.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.33.0)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)\n", - "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.9/site-packages (from kubernetes<26,>=8.0.0->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.49.0)\n", 
- "Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.5)\n", - "Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (10.2.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.1)\n", - "Requirement already satisfied: importlib-resources>=3.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.2)\n", - "Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from msal<2.0.0,>=1.24.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.0)\n", - "Requirement already satisfied: portalocker<3,>=1.0 in /opt/conda/lib/python3.9/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)\n", - "Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.12.3)\n", - "Requirement already satisfied: bleach!=5.0.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.0)\n", - "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.1)\n", - "Requirement already satisfied: jupyter-core>=4.7 in /opt/conda/lib/python3.9/site-packages (from 
nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.7.1)\n", - "Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)\n", - "Requirement already satisfied: mistune<4,>=2.0.3 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)\n", - "Requirement already satisfied: nbformat>=5.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.2)\n", - "Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)\n", - "Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.1)\n", - "Requirement already satisfied: pyzmq<25,>=17 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.1)\n", - "Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1.0)\n", - "Requirement already satisfied: jupyter-client<8,>=5.3.4 in /opt/conda/lib/python3.9/site-packages (from 
notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.4.9)\n", - "Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)\n", - "Requirement already satisfied: ipykernel in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.29.3)\n", - "Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.2)\n", - "Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)\n", - "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.0)\n", - "Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.13)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from 
requests-oauthlib>=0.5.0->msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.2.2)\n", - "Requirement already satisfied: wheel in /opt/conda/lib/python3.9/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.41.2)\n", - "Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.1)\n", - "Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)\n", - "Requirement already satisfied: pure-eval in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.2)\n", - "Requirement already satisfied: webencodings in /opt/conda/lib/python3.9/site-packages (from bleach!=5.0.0->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", - "Requirement already satisfied: pycparser in /opt/conda/lib/python3.9/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.21)\n", - "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /opt/conda/lib/python3.9/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)\n", - "Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.9/site-packages (from jupyter-core>=4.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.10.0)\n", - "Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.9/site-packages (from 
nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.12.5)\n", - "Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.4)\n", - "Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.9/site-packages (from nbformat>=5.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.19.1)\n", - "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)\n", - "Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.9/site-packages (from argon2-cffi->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)\n", - "Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.9/site-packages (from beautifulsoup4->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5)\n", - "Requirement already satisfied: comm>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.1)\n", - "Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.1)\n", - "Requirement already satisfied: jupyter-events>=0.9.0 in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 
1)) (0.9.0)\n", - "Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.2)\n", - "Requirement already satisfied: overrides in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.7.0)\n", - "Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.7)\n", - "Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.4)\n", - "Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.1)\n", - "Requirement already satisfied: fqdn in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)\n", - "Requirement already satisfied: isoduration in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (20.11.0)\n", - "Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1)\n", - "Requirement 
already satisfied: uri-template in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)\n", - "Requirement already satisfied: webcolors>=1.11 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13)\n", - "Requirement already satisfied: arrow>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)\n", - "Requirement already satisfied: types-python-dateutil>=2.8.10 in /opt/conda/lib/python3.9/site-packages (from arrow>=0.15.0->isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.19.20240106)\n", - "Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 274.2 MB/s eta 0:00:00\n", - "Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 277.9 MB/s eta 0:00:00\n", - "Downloading optimum-1.6.4-py3-none-any.whl (227 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.8/227.8 kB 291.3 MB/s eta 0:00:00\n", - "Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 242.4 MB/s eta 0:00:00\n", - "Downloading datasets-2.10.1-py3-none-any.whl (469 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 469.0/469.0 kB 185.9 MB/s eta 0:00:00\n", - "Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 275.9 MB/s eta 0:00:00\n", - "Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 282.3 MB/s eta 0:00:00\n", - 
"Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 346.4/346.4 kB 311.7 MB/s eta 0:00:00\n", - "Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 269.6 MB/s eta 0:00:00\n", - "Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 773.4/773.4 kB 311.9 MB/s eta 0:00:00\n", - "Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 264.1 MB/s eta 0:00:00\n", - "Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.5/755.5 MB 204.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 40.3 MB/s eta 0:00:00\n", - "Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 43.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 46.9 MB/s eta 0:00:00\n", - "Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 51.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 58.2 MB/s eta 0:00:00\n", - "Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 69.0 MB/s eta 0:00:00\n", - 
"Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 36.0 MB/s eta 0:00:00\n", - "Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 52.8 MB/s eta 0:00:00\n", - "Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 45.9 MB/s eta 0:00:00\n", - "Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.0/166.0 MB 19.6 MB/s eta 0:00:00\n", - "Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 27.7 MB/s eta 0:00:00\n", - "Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.9/167.9 MB 41.3 MB/s eta 0:00:00\n", - "Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 42.8 MB/s eta 0:00:00\n", - "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 192.0 MB/s eta 0:00:00\n", - "Downloading filelock-3.13.1-py3-none-any.whl (11 kB)\n", - "Downloading flatbuffers-24.3.7-py2.py3-none-any.whl (26 kB)\n", - "Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 kB 100.7 MB/s eta 0:00:00\n", - "Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 41.4 MB/s eta 0:00:00\n", - "Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 253.7 MB/s eta 0:00:00\n", - "Downloading 
mpmath-1.3.0-py3-none-any.whl (536 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 45.4 MB/s eta 0:00:00\n", - "Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 46.1 MB/s eta 0:00:00\n", - "Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 43.7 MB/s eta 0:00:00\n", - "Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 43.8 MB/s eta 0:00:00\n", - "Installing collected packages: tokenizers, sentencepiece, mpmath, flatbuffers, sympy, regex, protobuf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, humanfriendly, filelock, dill, triton, responses, onnx, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, coloredlogs, transformers, scikit-learn, onnxruntime, nvidia-cusolver-cu12, torch, datasets, optimum\n", - " Attempting uninstall: protobuf\n", - " Found existing installation: protobuf 3.20.3\n", - " Uninstalling protobuf-3.20.3:\n", - " Successfully uninstalled protobuf-3.20.3\n", - " Attempting uninstall: numpy\n", - " Found existing installation: numpy 1.26.4\n", - " Uninstalling numpy-1.26.4:\n", - " Successfully uninstalled numpy-1.26.4\n", - " Attempting uninstall: scikit-learn\n", - " Found existing installation: scikit-learn 1.4.1.post1\n", - " Uninstalling scikit-learn-1.4.1.post1:\n", - " Successfully uninstalled scikit-learn-1.4.1.post1\n", - "Successfully installed coloredlogs-15.0.1 datasets-2.10.1 dill-0.3.6 filelock-3.13.1 flatbuffers-24.3.7 huggingface-hub-0.21.4 humanfriendly-10.0 mpmath-1.3.0 multiprocess-0.70.14 networkx-3.2.1 numpy-1.23.5 nvidia-cublas-cu12-12.1.3.1 
nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 onnx-1.14.1 onnxruntime-1.16.3 optimum-1.6.4 protobuf-3.20.2 regex-2023.12.25 responses-0.18.0 scikit-learn-1.0.2 sentencepiece-0.2.0 sympy-1.12 tokenizers-0.13.3 torch-2.2.1 transformers-4.26.1 triton-2.2.0\n", - "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", - "\u001b[36mINFO\u001b[0m[0238] Taking snapshot of full filesystem... \n", - "\u001b[36mINFO\u001b[0m[0463] Pushing image to docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest \n", - "\u001b[36mINFO\u001b[0m[0493] Pushed docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer@sha256:691d0bb3c23487b4b5d2f84ab323c24735626ee81681475f53a4158b72d4cfee \n" - ] - }, - { - "data": { - "text/plain": [ - "BuildStatus(ready=True, outputs={'image': '.mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest'})" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "project.build_function(\"hugging-face-classifier-trainer\",with_mlrun=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:22:42,252 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '53252ce7aacb4b1aacf86bf3b862daa2', 'db': 
'http://mlrun-api:8080'}\n", - "> 2024-03-24 17:22:42,536 [info] Job is running in the background, pod: hugging-face-classifier-trainer-train-dqqfr\n", - "> 2024-03-24 17:24:43,288 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2\n", - "> 2024-03-24 17:24:43,847 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub\n", - "Downloading metadata: 100%|██████████| 1.03k/1.03k [00:00<00:00, 6.77MB/s]\n", - "Downloading and preparing dataset None/None (download: 265.13 KiB, generated: 1.50 MiB, post-processed: Unknown size, total: 1.76 MiB) to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n", - "Downloading data files: 0%| | 0/3 [00:00 2024-03-24 17:24:47,076 [info] training 'huggingface-model'\n", - "The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", - "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - "***** Running training *****\n", - " Num examples = 100\n", - " Num Epochs = 3\n", - " Instantaneous batch size per device = 16\n", - " Total train batch size (w. parallel, distributed & accumulation) = 16\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 21\n", - " Number of trainable parameters = 66955010\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - " 0%| | 0/21 [00:00 2024-03-24 17:26:00,230 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia', 'logs_cmd': 'mlrun logs 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia'}\n", - "> 2024-03-24 17:26:00,231 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/hugging-face-trainer-avia/jobs/monitor/53252ce7aacb4b1aacf86bf3b862daa2/overview'}\n", - "> 2024-03-24 17:26:00,231 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:24:39completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=job
owner=avia
mlrun/client_version=1.6.1
mlrun/client_python_version=3.9.16
host=hugging-face-classifier-trainer-train-dqqfr
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.5215
learning_rate=0.0
eval_loss=0.4750453531742096
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=1.0524
eval_samples_per_second=22.806
eval_steps_per_second=1.9
train_runtime=55.1543
train_samples_per_second=5.439
train_steps_per_second=0.381
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-03-24 17:26:09,792 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}\n" - ] - } - ], - "source": [ - "train_run = hugging_face_classifier_trainer.run(params={\n", - " \"hf_dataset\": \"Shayanvsf/US_Airline_Sentiment\",\n", - " \"drop_columns\": [\n", - " \"airline_sentiment_confidence\",\n", - " \"negativereason_confidence\",\n", - " ],\n", - " \"pretrained_tokenizer\": \"distilbert-base-uncased\",\n", - " \"pretrained_model\": \"distilbert-base-uncased\",\n", - " \"model_class\": \"transformers.AutoModelForSequenceClassification\",\n", - " \"label_name\": \"airline_sentiment\",\n", - " \"num_of_train_samples\": 100,\n", - " \"metrics\": [\"accuracy\", \"f1\"],\n", - " \"random_state\": 42,\n", - " **additional_parameters\n", - " },\n", - " handler=\"train\", \n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "[Back to the top](#top)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mlrun-base", - "language": "python", - "name": "conda-env-mlrun-base-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff 
--git a/functions/development/hugging_face_classifier_trainer/latest/src/hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/latest/src/hugging_face_classifier_trainer.py deleted file mode 100755 index 29d07039..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/src/hugging_face_classifier_trainer.py +++ /dev/null @@ -1,832 +0,0 @@ -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import mlrun -import mlrun.datastore -import mlrun.utils -import numpy as np -import pandas as pd -import transformers -from datasets import Dataset, load_dataset, load_metric -from mlrun import MLClientCtx -from mlrun import feature_store as fs -from mlrun.artifacts import Artifact, PlotlyArtifact -from mlrun.datastore import DataItem -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import create_class -from plotly import graph_objects as go -from sklearn.model_selection import train_test_split -from transformers import ( - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - PreTrainedModel, - PreTrainedTokenizer, - Trainer, - TrainerCallback, - TrainerControl, - TrainerState, - TrainingArguments, -) - - -# ----------------------from MLRUN-------------------------------- -class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. 
- _REPLACED_METHODS = [ - "optimize", - ] - - @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper - - def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data - - -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow 
keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: 
PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - ) - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return - - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - 
artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. 
If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError - - -# ---------------------- from auto_trainer-------------------------------- -class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_" - - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. 
- :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. - """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {: , ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. 
- :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - 
test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." - ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() 
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config) diff --git a/functions/development/hugging_face_classifier_trainer/latest/src/item.yaml b/functions/development/hugging_face_classifier_trainer/latest/src/item.yaml deleted file mode 100755 index 3c087765..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/src/item.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: v1 -categories: 
-- machine-learning -- model-training -description: Automatic train and optimize functions for HuggingFace framework -doc: '' -example: hugging_face_classifier_trainer.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: davids -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.6.1 -name: hugging_face_classifier_trainer -platformVersion: 3.5.5 -spec: - filename: hugging_face_classifier_trainer.py - handler: train - image: mlrun/mlrun - kind: job - requirements: - - onnx~=1.14.1 - - onnxruntime~=1.16.1 - - optimum~=1.6.4 - - transformers~=4.26.1 - - datasets~=2.10.1 - - scikit-learn~=1.0.2 -url: '' -version: 0.2.0 diff --git a/functions/development/hugging_face_classifier_trainer/latest/src/requirements.txt b/functions/development/hugging_face_classifier_trainer/latest/src/requirements.txt deleted file mode 100644 index 9d0db7b4..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/src/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -onnx~=1.14.1 -onnxruntime~=1.16.1 -optimum~=1.6.4 -transformers~=4.26.1 -datasets~=2.10.1 -scikit-learn~=1.0.2 \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/src/test_hugging_face_classifier_trainer.py b/functions/development/hugging_face_classifier_trainer/latest/src/test_hugging_face_classifier_trainer.py deleted file mode 100644 index a5e0fee9..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/src/test_hugging_face_classifier_trainer.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os - -import mlrun -import pytest -from mlrun import import_function - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - -ADDITIONAL_PARAM_FOR_TRAIN = { - "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples", - "TRAIN_learning_rate": 2e-5, - "TRAIN_per_device_train_batch_size": 16, - "TRAIN_per_device_eval_batch_size": 16, - "TRAIN_num_train_epochs": 2, - "TRAIN_weight_decay": 0.01, - "TRAIN_push_to_hub": False, - "TRAIN_evaluation_strategy": "epoch", - "TRAIN_eval_steps": 1, - "TRAIN_logging_steps": 1, - "CLASS_num_labels": 2, -} - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. 
- """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(env_file=None): - if env_file: - mlrun.set_env_from_file(env_file) - mlrun.get_or_create_project( - "hugging-face-classifier-trainer-test", context="./", user_project=True - ) - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - "model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_train_and_optimize_sequence_classification(): - _set_environment() - - # Importing function: - fn = import_function("function.yaml") - - train_run = None - optimize_run = None - - try: - train_run = fn.run( - params={ - "hf_dataset": "Shayanvsf/US_Airline_Sentiment", - "drop_columns": [ - "airline_sentiment_confidence", - "negativereason_confidence", - ], - "pretrained_tokenizer": "distilbert-base-uncased", - "pretrained_model": "distilbert-base-uncased", - 
"model_class": "transformers.AutoModelForSequenceClassification", - "label_name": "airline_sentiment", - "num_of_train_samples": 100, - "metrics": ["accuracy", "f1"], - "random_state": 42, - **ADDITIONAL_PARAM_FOR_TRAIN, - }, - handler="train", - local=True, - ) - - optimize_run = fn.run( - params={"model_path": train_run.outputs["model"]}, - handler="optimize", - local=True, - ) - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert train_run and all( - key in train_run.outputs for key in ["model", "loss"] - ), "outputs should include more data" - assert optimize_run and all( - key in optimize_run.outputs for key in ["model"] - ), "outputs should include more data" diff --git a/functions/development/hugging_face_classifier_trainer/latest/static/documentation.html b/functions/development/hugging_face_classifier_trainer/latest/static/documentation.html deleted file mode 100644 index 1652c838..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/static/documentation.html +++ /dev/null @@ -1,394 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

hugging_face_classifier_trainer package

- -
- -
-
-
-
-
-

hugging_face_classifier_trainer package#

-
-

Submodules#

-
-
-

hugging_face_classifier_trainer.hugging_face_classifier_trainer module#

-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFORTOptimizerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common.MLRunInterface, abc.ABC

-

Interface for adding MLRun features for the Hugging Face ONNX Runtime (ORT) optimizer API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface's properties, methods and functions, so it will have -MLRun’s features. -:param obj: The object whose interface to enrich. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-enable_auto_logging(context: mlrun.execution.MLClientCtx, model_name: str = 'model', tag: str = '', labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None)[source]#
-
-
-
-classmethod mlrun_optimize()[source]#
-

MLRun’s tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.HFTrainerMLRunInterface(*args: Any, **kwargs: Any)[source]#
-

Bases: mlrun.frameworks._common.MLRunInterface, abc.ABC

-

Interface for adding MLRun features for the Hugging Face transformers Trainer API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj: transformers.Trainer, restoration: Optional[mlrun.frameworks._common.CommonTypes.MLRunInterfaceRestorationType] = None)[source]#
-

Enrich the object with this interface's properties, methods and functions, so it will have -MLRun’s features. -:param obj: The object whose interface to enrich. -:param restoration: Restoration information tuple as returned from ‘remove_interface’ in order to

-
-

add the interface in a certain state.

-
-
-
-
-classmethod mlrun_train()[source]#
-

MLRun’s tf.keras.Model.fit wrapper. It will set up the optimizer when using horovod. The optimizer must be -passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.

-

raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.

-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.KWArgsPrefixes[source]#
-

Bases: object

-
-
-FIT = 'FIT_'#
-
-
-
-MODEL_CLASS = 'CLASS_'#
-
-
-
-PREDICT = 'PREDICT_'#
-
-
-
-TRAIN = 'TRAIN_'#
-
-
-
-
-class hugging_face_classifier_trainer.hugging_face_classifier_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
-

Bases: transformers.TrainerCallback

-

Callback for collecting logs during training / evaluation of the Trainer API.

-
-
-on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
-
-
-
-on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.apply_mlrun(huggingface_object, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
-

Wrap the given model with MLRun’s interface providing it with mlrun’s additional features. -:param huggingface_object: The model to wrap. Can be loaded from the model path given as well. -:param model_name: The model name to use for storing the model artifact. Default: “model”. -:param tag: The model’s tag to log with. -:param context: MLRun context to work with. If no context is given it will be retrieved via

-
-

‘mlrun.get_or_create_ctx(None)’

-
-
-
Parameters
-

auto_log – Whether to enable MLRun’s auto logging. Default: True.

-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.optimize(model_path: str, model_name: str = 'optimized_model', target_dir: str = './optimized', optimization_level: int = 1)[source]#
-

Optimizing the transformer model using ONNX optimization.

-
-
Parameters
-
    -
  • model_path – The path of the model to optimize.

  • -
  • model_name – Name of the optimized model.

  • -
  • target_dir – The directory to save the ONNX model.

  • -
  • optimization_level – Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)

  • -
-
-
-
-
-
-hugging_face_classifier_trainer.hugging_face_classifier_trainer.train(context: mlrun.execution.MLClientCtx, hf_dataset: Optional[str] = None, dataset: Optional[mlrun.datastore.base.DataItem] = None, test_set: Optional[mlrun.datastore.base.DataItem] = None, drop_columns: Optional[List[str]] = None, pretrained_tokenizer: Optional[str] = None, pretrained_model: Optional[str] = None, model_class: Optional[str] = None, model_name: str = 'huggingface-model', label_name: str = 'labels', text_col: str = 'text', num_of_train_samples: Optional[int] = None, train_test_split_size: Optional[float] = None, metrics: Optional[List[str]] = None, random_state: Optional[int] = None)[source]#
-

Train and evaluate a pretrained model with a pretrained tokenizer over a dataset. -The dataset can be either the name of a dataset hosted on the HuggingFace hub, -or a URI or a FeatureVector.

-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • hf_dataset – The name of the dataset to get from the HuggingFace hub

  • -
  • dataset – The dataset to train the model on. Can be either a URI or a FeatureVector

  • -
test_set – The test set to evaluate the model with.

  • -
  • drop_columns – The columns to drop from the dataset.

  • -
  • pretrained_tokenizer – The name of the pretrained tokenizer from the HuggingFace hub.

  • -
  • pretrained_model – The name of the pretrained model from the HuggingFace hub.

  • -
model_name – The model’s name to use for storing the model artifact, defaults to ‘huggingface-model’

  • -
  • model_class – The class of the model, e.g. transformers.AutoModelForSequenceClassification

  • -
  • label_name – The target label of the column in the dataset.

  • -
text_col – The input text column in the dataset.

  • -
  • num_of_train_samples – Max number of training samples, for debugging.

  • -
  • train_test_split_size – Should be between 0.0 and 1.0 and represent the proportion of the dataset to include -in the test split.

  • -
metrics – List of metrics for evaluating the model, such as f1, accuracy, etc.

  • -
  • random_state – Random state for train_test_split

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/static/example.html b/functions/development/hugging_face_classifier_trainer/latest/static/example.html deleted file mode 100644 index 5fdd60e5..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/static/example.html +++ /dev/null @@ -1,2406 +0,0 @@ - - - - - - - -MLRun Hugging Face Classifier Trainer Tutorial - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-

-
-

MLRun Hugging Face Classifier Trainer Tutorial#

-

This notebook shows how to use the handlers of the Hugging Face classifier trainer. -The available handlers are:

-
    -
  • train

  • -
  • optimize

  • -
-

All you need is an HF model type and an HF dataset name.

-
-
-
%pip install -r requirements.txt
-
-
-
-
-
Requirement already satisfied: onnx~=1.14.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (1.14.1)
-Requirement already satisfied: onnxruntime==1.16.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.16.1)
-Requirement already satisfied: optimum~=1.6.4 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.6.4)
-Requirement already satisfied: transformers~=4.26.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (4.26.1)
-Requirement already satisfied: datasets~=2.10.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (2.10.1)
-Requirement already satisfied: scikit-learn~=1.0.2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.0.2)
-Requirement already satisfied: coloredlogs in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (15.0.1)
-Requirement already satisfied: flatbuffers in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)
-Requirement already satisfied: numpy>=1.21.6 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.23.5)
-Requirement already satisfied: packaging in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (21.3)
-Requirement already satisfied: protobuf in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.20.2)
-Requirement already satisfied: sympy in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.12)
-Requirement already satisfied: typing-extensions>=3.6.2.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from onnx~=1.14.1->-r requirements.txt (line 1)) (4.7.1)
-Requirement already satisfied: torch>=1.9 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.2)
-Requirement already satisfied: huggingface-hub>=0.8.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from optimum~=1.6.4->-r requirements.txt (line 3)) (0.20.1)
-Requirement already satisfied: filelock in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (3.13.1)
-Requirement already satisfied: pyyaml>=5.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (5.4.1)
-Requirement already satisfied: regex!=2019.12.17 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2023.12.25)
-Requirement already satisfied: requests in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (2.31.0)
-Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (0.13.3)
-Requirement already satisfied: tqdm>=4.27 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from transformers~=4.26.1->-r requirements.txt (line 4)) (4.65.0)
-Requirement already satisfied: pyarrow>=6.0.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (11.0.0)
-Requirement already satisfied: dill<0.3.7,>=0.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.3.6)
-Requirement already satisfied: pandas in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.4)
-Requirement already satisfied: xxhash in /conda/envs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.3.0)
-Requirement already satisfied: multiprocess in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.70.14)
-Requirement already satisfied: fsspec>=2021.11.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from fsspec[http]>=2021.11.1->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.9.2)
-Requirement already satisfied: aiohttp in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (3.9.1)
-Requirement already satisfied: responses<0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from datasets~=2.10.1->-r requirements.txt (line 5)) (0.18.0)
-Requirement already satisfied: scipy>=1.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.11.4)
-Requirement already satisfied: joblib>=0.11 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (1.3.2)
-Requirement already satisfied: threadpoolctl>=2.0.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r requirements.txt (line 6)) (3.2.0)
-Requirement already satisfied: attrs>=17.3.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (19.1.0)
-Requirement already satisfied: multidict<7.0,>=4.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (6.0.4)
-Requirement already satisfied: yarl<2.0,>=1.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.9.2)
-Requirement already satisfied: frozenlist>=1.1.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.4.0)
-Requirement already satisfied: aiosignal>=1.1.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (1.3.1)
-Requirement already satisfied: async-timeout<5.0,>=4.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from aiohttp->datasets~=2.10.1->-r requirements.txt (line 5)) (4.0.3)
-Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from packaging->onnxruntime==1.16.1->-r requirements.txt (line 2)) (3.1.1)
-Requirement already satisfied: charset-normalizer<4,>=2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2.1.1)
-Requirement already satisfied: idna<4,>=2.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (3.4)
-Requirement already satisfied: urllib3<3,>=1.21.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (1.26.16)
-Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from requests->transformers~=4.26.1->-r requirements.txt (line 4)) (2023.7.22)
-Requirement already satisfied: networkx in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.2.1)
-Requirement already satisfied: jinja2 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (3.1.3)
-Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (8.9.2.26)
-Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.3.1)
-Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.0.2.54)
-Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (10.3.2.106)
-Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (11.4.5.107)
-Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.0.106)
-Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.18.1)
-Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.1.105)
-Requirement already satisfied: triton==2.1.0 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.0)
-Requirement already satisfied: nvidia-nvjitlink-cu12 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (12.3.101)
-Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from transformers[sentencepiece]>=4.26.0->optimum~=1.6.4->-r requirements.txt (line 3)) (0.2.0)
-Requirement already satisfied: humanfriendly>=9.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from coloredlogs->onnxruntime==1.16.1->-r requirements.txt (line 2)) (9.2)
-Requirement already satisfied: python-dateutil>=2.8.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2.8.2)
-Requirement already satisfied: pytz>=2020.1 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (2023.3.post1)
-Requirement already satisfied: mpmath>=0.19 in /User/.pythonlibs/mlrun-base/lib/python3.9/site-packages (from sympy->onnxruntime==1.16.1->-r requirements.txt (line 2)) (1.3.0)
-Requirement already satisfied: six>=1.5 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas->datasets~=2.10.1->-r requirements.txt (line 5)) (1.16.0)
-Requirement already satisfied: MarkupSafe>=2.0 in /conda/envs/mlrun-base/lib/python3.9/site-packages (from jinja2->torch>=1.9->optimum~=1.6.4->-r requirements.txt (line 3)) (2.1.3)
-Note: you may need to restart the kernel to use updated packages.
-
-
-
-
-
-
-
import mlrun
-
-
-
-
-
-
-
project = mlrun.get_or_create_project('hugging-face-trainer', context="./", user_project=True)
-
-
-
-
-
> 2024-03-24 17:10:17,091 [info] Project loaded successfully: {'project_name': 'hugging-face-trainer'}
-
-
-
-
-
-

Importing the hugging_face_classifier_trainer function from the Marketplace#

-
-
-
hugging_face_classifier_trainer = mlrun.import_function("hub://hugging_face_classifier_trainer")
-
-
-
-
-
-
-

Training a model#

-

Choosing the train handler

-
-

Define task parameters#

-
    -
  • Class parameters should contain the prefix CLASS_

  • -
  • Train parameters should contain the prefix TRAIN_

  • -
-
-
-
model_class = "transformers.AutoModelForSequenceClassification"
-additional_parameters = {
-    "TRAIN_output_dir": "finetuning-sentiment-model-3000-samples",
-    "TRAIN_learning_rate": 2e-5,
-    "TRAIN_per_device_train_batch_size": 16,
-    "TRAIN_per_device_eval_batch_size": 16,
-    "TRAIN_num_train_epochs": 3,
-    "TRAIN_weight_decay": 0.01,
-    "TRAIN_push_to_hub": False,
-    "TRAIN_evaluation_strategy": "epoch",
-    "TRAIN_eval_steps": 1,
-    "TRAIN_logging_steps": 1,
-    "CLASS_num_labels": 2
-}
-
-
-
-
-
-
-

Running the Training job with the “train” handler#

-
-
-
train_run = hugging_face_classifier_trainer.run(params={
-                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
-                                                        "drop_columns": [
-                                                            "airline_sentiment_confidence",
-                                                            "negativereason_confidence",
-                                                        ],
-                                                        "pretrained_tokenizer": "distilbert-base-uncased",
-                                                        "pretrained_model": "distilbert-base-uncased",
-                                                        "model_class": "transformers.AutoModelForSequenceClassification",
-                                                        "label_name": "airline_sentiment",
-                                                        "num_of_train_samples": 100,
-                                                        "metrics": ["accuracy", "f1"],
-                                                        "random_state": 42,
-                                                        **additional_parameters
-                                                    },
-                                                    handler="train",
-                                                    local=True,
-                                                )
-
-
-
-
-
> 2024-03-24 17:10:21,025 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '514d8d5530c842238b1cc81983cd943e', 'db': 'http://mlrun-api:8080'}
-> 2024-03-24 17:11:03,727 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2
-> 2024-03-24 17:11:03,882 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub
-
-
-
Found cached dataset parquet (/igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
-
-
Loading cached shuffled indices for dataset at /igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ec18d1773cfb9bb5.arrow
-Loading cached shuffled indices for dataset at /igz/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e0c54c494a578ee6.arrow
-
-
-
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
-- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
-- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
-Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
-You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
-
-
-
> 2024-03-24 17:11:08,938 [info] training 'huggingface-model'
-
-
-
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
-***** Running training *****
-  Num examples = 100
-  Num Epochs = 3
-  Instantaneous batch size per device = 16
-  Total train batch size (w. parallel, distributed & accumulation) = 16
-  Gradient Accumulation steps = 1
-  Total optimization steps = 21
-  Number of trainable parameters = 66955010
-You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
-
-
-
-
- - [21/21 00:15, Epoch 3/3] -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
EpochTraining LossValidation LossAccuracyF1
10.7389000.5153110.7916670.000000
20.5259000.4815630.7916670.000000
30.4908000.4716750.7916670.000000

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-/tmp/tmp0c1aawrq.py:561: FutureWarning:
-
-load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
-
-The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-
-
-Training completed. Do not forget to share your model on huggingface.co/models =)
-
-
-tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json
-Special tokens file saved in /tmp/tokenizer/special_tokens_map.json
-Configuration saved in /tmp/model/config.json
-Model weights saved in /tmp/model/pytorch_model.bin
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:10:21completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.4908
learning_rate=0.0
eval_loss=0.47167453169822693
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=0.5186
eval_samples_per_second=46.276
eval_steps_per_second=3.856
train_runtime=17.6054
train_samples_per_second=17.04
train_steps_per_second=1.193
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:12:01,880 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
-
-
-
-
-
-
-

The result of the train run#

-
-
-
train_run.outputs
-
-
-
-
-
{'loss': 0.4908,
- 'learning_rate': 0.0,
- 'eval_loss': 0.47167453169822693,
- 'eval_accuracy': 0.7916666666666666,
- 'eval_f1': 0.0,
- 'eval_runtime': 0.5186,
- 'eval_samples_per_second': 46.276,
- 'eval_steps_per_second': 3.856,
- 'train_runtime': 17.6054,
- 'train_samples_per_second': 17.04,
- 'train_steps_per_second': 1.193,
- 'total_flos': 3327208489680.0,
- 'loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/loss_plot.html',
- 'learning_rate_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/learning_rate_plot.html',
- 'eval_loss_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_loss_plot.html',
- 'eval_accuracy_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_accuracy_plot.html',
- 'eval_f1_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_f1_plot.html',
- 'eval_runtime_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_runtime_plot.html',
- 'eval_samples_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_samples_per_second_plot.html',
- 'eval_steps_per_second_plot': 'v3io:///projects/hugging-face-trainer-avia/artifacts/hugging-face-classifier-trainer-train/0/eval_steps_per_second_plot.html',
- 'tokenizer': 'store://artifacts/hugging-face-trainer-avia/hugging-face-classifier-trainer-train_tokenizer@514d8d5530c842238b1cc81983cd943e',
- 'model': 'store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e'}
-
-
-
-
-
-
-
train_run.artifact('loss_plot').show()
-
-
-
-
-
- - -
-
- -
-
-
-
-

Getting the model for evaluating and predicting#

-
-
-
model_path = train_run.outputs['model']
-
-
-
-
-
-
-
-

Optimize the model#

-

Choosing the optimize handler

-

The result of using this handler is an ONNX-optimized model.

-
-
-
optimize_run = hugging_face_classifier_trainer.run(params={
-                                                        "model_path": str(model_path)
-                                                    },
-                                                    handler="optimize",
-                                                    local=True,
-                                                )
-
-
-
-
-
> 2024-03-24 17:12:02,020 [info] Storing function: {'name': 'hugging-face-classifier-trainer-optimize', 'uid': 'fbee1ead18444824a4b5c0308a677bf4', 'db': 'http://mlrun-api:8080'}
-
-
-
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/optimum/onnxruntime/configuration.py:726: FutureWarning:
-
-disable_embed_layer_norm will be deprecated soon, use disable_embed_layer_norm_fusion instead, disable_embed_layer_norm_fusion is set to True.
-
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/config.json",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading weights file /tmp/pytorch_model.bin
-All model checkpoint weights were used when initializing DistilBertForSequenceClassification.
-
-All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at /tmp.
-If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
-/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py:218: TracerWarning:
-
-torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
-
-Configuration saved in /tmp/tmp79wjp8m8/config.json
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Configuration saved in optimized/config.json
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Could not locate the tokenizer configuration file, will try to use the model config instead.
-loading configuration file /tmp/tmp79wjp8m8/config.json
-Model config DistilBertConfig {
-  "_name_or_path": "/tmp/tmp79wjp8m8",
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "problem_type": "single_label_classification",
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.26.1",
-  "vocab_size": 30522
-}
-
-Failed to remove node input: "/distilbert/transformer/layer.0/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.0/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.0/attention/Div_output_0"
-name: "/distilbert/transformer/layer.0/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.1/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.1/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.1/attention/Div_output_0"
-name: "/distilbert/transformer/layer.1/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.2/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.2/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.2/attention/Div_output_0"
-name: "/distilbert/transformer/layer.2/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.3/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.3/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.3/attention/Div_output_0"
-name: "/distilbert/transformer/layer.3/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.4/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.4/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.4/attention/Div_output_0"
-name: "/distilbert/transformer/layer.4/attention/Div"
-op_type: "Div"
-
-Failed to remove node input: "/distilbert/transformer/layer.5/attention/Transpose_output_0"
-input: "/distilbert/transformer/layer.5/attention/Constant_11_output_0"
-output: "/distilbert/transformer/layer.5/attention/Div_output_0"
-name: "/distilbert/transformer/layer.5/attention/Div"
-op_type: "Div"
-
-Configuration saved in optimized/config.json
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:12:02completedhugging-face-classifier-trainer-optimize
v3io_user=avia
kind=local
owner=avia
host=jupyter-avia-6454bdd4c5-xz8cg
model_path=store://artifacts/hugging-face-trainer-avia/huggingface-model@514d8d5530c842238b1cc81983cd943e
model
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:12:22,721 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-optimize'}
-
-
-
-
-
-
-
optimize_run.outputs
-
-
-
-
-
{'model': 'store://artifacts/hugging-face-trainer-avia/optimized_model@fbee1ead18444824a4b5c0308a677bf4'}
-
-
-
-
-
-
-

Running the training remotely#

-
-
-
project.build_function("hugging-face-classifier-trainer",with_mlrun=True)
-
-
-
-
-
/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlrun/projects/operations.py:276: OverwriteBuildParamsWarning:
-
-The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.8.0.
-
-
-
> 2024-03-24 17:14:22,792 [info] Started building image: .mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest
-INFO[0000] Retrieving image manifest mlrun/mlrun:1.6.1  
-INFO[0000] Retrieving image mlrun/mlrun:1.6.1 from registry index.docker.io 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:1.6.1  
-INFO[0000] Returning cached image manifest              
-INFO[0000] Executing 0 build triggers                   
-INFO[0000] Building stage 'mlrun/mlrun:1.6.1' [idx: '0', base-idx: '-1'] 
-INFO[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. 
-INFO[0047] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt 
-INFO[0047] Initializing snapshotter ...                 
-INFO[0047] Taking snapshot of full filesystem...        
-INFO[0074] Cmd: /bin/sh                                 
-INFO[0074] Args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 
-INFO[0074] Running: [/bin/sh -c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 
-Installing /empty/requirements.txt...
-mlrun[complete]==1.6.1
-onnx~=1.14.1
-onnxruntime~=1.16.1
-optimum~=1.6.4
-transformers~=4.26.1
-datasets~=2.10.1
-scikit-learn~=1.0.2
-INFO[0074] Taking snapshot of full filesystem...        
-INFO[0078] No files were changed, appending empty layer to config. No layer added to image. 
-INFO[0078] RUN python -m pip install -r /empty/requirements.txt 
-INFO[0078] Cmd: /bin/sh                                 
-INFO[0078] Args: [-c python -m pip install -r /empty/requirements.txt] 
-INFO[0078] Running: [/bin/sh -c python -m pip install -r /empty/requirements.txt] 
-Requirement already satisfied: mlrun[complete]==1.6.1 in /opt/conda/lib/python3.9/site-packages (from -r /empty/requirements.txt (line 1)) (1.6.1)
-Collecting onnx~=1.14.1 (from -r /empty/requirements.txt (line 2))
-  Obtaining dependency information for onnx~=1.14.1 from https://files.pythonhosted.org/packages/ff/24/0e522fdcadf0e15fc304145a5b6e5d7246d7f2c507fd9bfe6e1fafb2aa95/onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
-Collecting onnxruntime~=1.16.1 (from -r /empty/requirements.txt (line 3))
-  Obtaining dependency information for onnxruntime~=1.16.1 from https://files.pythonhosted.org/packages/de/ab/ed3ae0d649cee41e870f8b1653cf4a1c1fc321e0ded4e3e1a3d4a25c0131/onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
-Collecting optimum~=1.6.4 (from -r /empty/requirements.txt (line 4))
-  Obtaining dependency information for optimum~=1.6.4 from https://files.pythonhosted.org/packages/31/72/a7e3b2c57d6368c5f4bb6fba54a85cbf07d25c385a2db3f1a638f3c0ddb2/optimum-1.6.4-py3-none-any.whl.metadata
-  Downloading optimum-1.6.4-py3-none-any.whl.metadata (17 kB)
-Collecting transformers~=4.26.1 (from -r /empty/requirements.txt (line 5))
-  Obtaining dependency information for transformers~=4.26.1 from https://files.pythonhosted.org/packages/1e/e2/60c3f4691b16d126ee9cfe28f598b13c424b60350ab339aba81aef054b8f/transformers-4.26.1-py3-none-any.whl.metadata
-  Downloading transformers-4.26.1-py3-none-any.whl.metadata (100 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.3/100.3 kB 6.2 MB/s eta 0:00:00
-Collecting datasets~=2.10.1 (from -r /empty/requirements.txt (line 6))
-  Obtaining dependency information for datasets~=2.10.1 from https://files.pythonhosted.org/packages/fe/17/5825fdf034ff1a315becdbb9b6fe5a2bd9d8e724464535f18809593bf9c2/datasets-2.10.1-py3-none-any.whl.metadata
-  Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)
-Collecting scikit-learn~=1.0.2 (from -r /empty/requirements.txt (line 7))
-  Obtaining dependency information for scikit-learn~=1.0.2 from https://files.pythonhosted.org/packages/57/aa/483fbe6b5314bce2d49801e6cec1f2139a9c220d0d51494788fff47233b3/scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
-Requirement already satisfied: urllib3<1.27,>=1.26.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.18)
-Requirement already satisfied: GitPython>=3.1.41,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.42)
-Requirement already satisfied: aiohttp~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.3)
-Requirement already satisfied: aiohttp-retry~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.3)
-Requirement already satisfied: click~=8.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.1.7)
-Requirement already satisfied: kfp~=1.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.22)
-Requirement already satisfied: nest-asyncio~=1.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.0)
-Requirement already satisfied: ipython~=8.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.18.1)
-Requirement already satisfied: nuclio-jupyter~=0.9.15 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.16)
-Requirement already satisfied: numpy<1.27.0,>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.26.4)
-Requirement already satisfied: pandas<2.2,>=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.4)
-Requirement already satisfied: pyarrow<15,>=10.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (14.0.2)
-Requirement already satisfied: pyyaml~=5.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.4.1)
-Requirement already satisfied: requests~=2.31 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.31.0)
-Requirement already satisfied: tabulate~=0.8.6 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.10)
-Requirement already satisfied: v3io~=0.5.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.23)
-Requirement already satisfied: pydantic>=1.10.8,~=1.10 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.10.14)
-Requirement already satisfied: mergedeep~=1.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.4)
-Requirement already satisfied: v3io-frames~=0.10.12 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.13)
-Requirement already satisfied: semver~=3.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
-Requirement already satisfied: dependency-injector~=4.41 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.41.0)
-Requirement already satisfied: fsspec==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
-Requirement already satisfied: v3iofs~=0.1.17 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.18)
-Requirement already satisfied: storey~=1.6.18 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.6.18)
-Requirement already satisfied: inflection~=0.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
-Requirement already satisfied: python-dotenv~=0.17.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.17.1)
-Requirement already satisfied: setuptools~=68.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (68.2.2)
-Requirement already satisfied: deprecated~=1.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.14)
-Requirement already satisfied: jinja2>=3.1.3,~=3.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.3)
-Requirement already satisfied: anyio~=3.7 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.7.1)
-Requirement already satisfied: orjson~=3.9 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.9.15)
-Requirement already satisfied: adlfs==2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.0)
-Requirement already satisfied: aiobotocore<2.8,>=2.5.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5.4)
-Requirement already satisfied: avro~=1.11 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.11.3)
-Requirement already satisfied: azure-core~=1.24 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.30.0)
-Requirement already satisfied: azure-identity~=1.5 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.15.0)
-Requirement already satisfied: azure-keyvault-secrets~=4.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.8.0)
-Requirement already satisfied: boto3<1.29.0,>=1.28.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.28.17)
-Requirement already satisfied: dask~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)
-Requirement already satisfied: databricks-sdk~=0.13.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.13.0)
-Requirement already satisfied: distributed~=2023.9.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.3)
-Requirement already satisfied: gcsfs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
-Requirement already satisfied: google-cloud-bigquery[bqstorage,pandas]==3.14.1 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.14.1)
-Requirement already satisfied: graphviz~=0.20.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.1)
-Requirement already satisfied: kafka-python~=2.0 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.2)
-Requirement already satisfied: mlflow~=2.8 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.10.2)
-Requirement already satisfied: msrest~=0.6.21 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.21)
-Requirement already satisfied: plotly<5.12.0,~=5.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.11.0)
-Requirement already satisfied: pyopenssl>=23 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.0)
-Requirement already satisfied: redis~=4.3 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.6.0)
-Requirement already satisfied: s3fs==2023.9.2 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.9.2)
-Requirement already satisfied: sqlalchemy~=1.4 in /opt/conda/lib/python3.9/site-packages (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.51)
-Requirement already satisfied: azure-datalake-store<0.1,>=0.0.46 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.0.53)
-Requirement already satisfied: azure-storage-blob>=12.12.0 in /opt/conda/lib/python3.9/site-packages (from adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (12.19.0)
-Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.1.1)
-Requirement already satisfied: google-auth>=1.2 in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.28.1)
-Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: google-cloud-storage in /opt/conda/lib/python3.9/site-packages (from gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.14.0)
-Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.1)
-Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)
-Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.7.0)
-Requirement already satisfied: packaging>=20.0.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1)
-Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)
-Requirement already satisfied: db-dtypes<2.0.0dev,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.24.0)
-Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)
-Requirement already satisfied: protobuf>=3.20.2 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (3.20.3)
-Requirement already satisfied: typing-extensions>=3.6.2.1 in /opt/conda/lib/python3.9/site-packages (from onnx~=1.14.1->-r /empty/requirements.txt (line 2)) (4.10.0)
-Collecting coloredlogs (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for coloredlogs from https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata
-  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
-Collecting flatbuffers (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/bf/45/c961e3cb6ddad76b325c163d730562bb6deb1ace5acbed0306f5fbefb90e/flatbuffers-24.3.7-py2.py3-none-any.whl.metadata
-  Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)
-Collecting sympy (from onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/d2/05/e6600db80270777c4a64238a98d442f0fd07cc8915be2a1c16da7f2b9e74/sympy-1.12-py3-none-any.whl.metadata
-  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
-Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0a/fd/280f4385e76f3c1890efc15fa93f7206134fefad6351397e1bfab6d0d0de/transformers-4.39.1-py3-none-any.whl.metadata
-  Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 40.1 MB/s eta 0:00:00
-Collecting torch>=1.9 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for torch>=1.9 from https://files.pythonhosted.org/packages/98/04/95a12556d068786d6505c609daf2805bed91c9210c5185499a7c121eba47/torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata
-  Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl.metadata (25 kB)
-Collecting numpy<1.27.0,>=1.16.5 (from mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1))
-  Obtaining dependency information for numpy<1.27.0,>=1.16.5 from https://files.pythonhosted.org/packages/4c/b9/038abd6fbd67b05b03cb1af590cfc02b7f1e5a37af7ac6a868f5093c29f5/numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
-Collecting huggingface-hub>=0.8.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for huggingface-hub>=0.8.0 from https://files.pythonhosted.org/packages/ab/28/d4b691840d73126d4c9845f8a22dad033ac872509b6d3a0d93b456eef424/huggingface_hub-0.21.4-py3-none-any.whl.metadata
-  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
-Collecting filelock (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata
-  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
-Collecting regex!=2019.12.17 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/05/9e/80c20f1151432a6025690c9c2037053039b028a7b236fa81d7e7ac9dec60/regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 217.5 MB/s eta 0:00:00
-Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for tokenizers!=0.11.3,<0.14,>=0.11.1 from https://files.pythonhosted.org/packages/d6/27/07a337087dd507170a1b20fed3bbf8da81401185a7130a6e74e440c52040/tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
-Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.9/site-packages (from transformers~=4.26.1->-r /empty/requirements.txt (line 5)) (4.65.0)
-Collecting dill<0.3.7,>=0.3.0 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for dill<0.3.7,>=0.3.0 from https://files.pythonhosted.org/packages/be/e3/a84bf2e561beed15813080d693b4b27573262433fced9c1d1fea59e60553/dill-0.3.6-py3-none-any.whl.metadata
-  Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)
-Requirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets~=2.10.1->-r /empty/requirements.txt (line 6)) (3.4.1)
-Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl.metadata
-  Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
-Collecting responses<0.19 (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for responses<0.19 from https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl.metadata
-  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
-Requirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.12.0)
-Requirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (1.3.2)
-Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn~=1.0.2->-r /empty/requirements.txt (line 7)) (3.3.0)
-Requirement already satisfied: botocore<1.31.18,>=1.31.17 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.31.17)
-Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
-Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/conda/lib/python3.9/site-packages (from aiobotocore<2.8,>=2.5.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.11.0)
-Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
-Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.2.0)
-Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)
-Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.0.5)
-Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.9.4)
-Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp~=3.9->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.3)
-Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.4)
-Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
-Requirement already satisfied: exceptiongroup in /opt/conda/lib/python3.9/site-packages (from anyio~=3.7->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: six>=1.11.0 in /opt/conda/lib/python3.9/site-packages (from azure-core~=1.24->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
-Requirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (42.0.2)
-Requirement already satisfied: msal<2.0.0,>=1.24.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.27.0)
-Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.1.0)
-Requirement already satisfied: isodate>=0.6.1 in /opt/conda/lib/python3.9/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.1)
-Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.1)
-Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.9/site-packages (from boto3<1.29.0,>=1.28.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.2)
-Requirement already satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.2.1)
-Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.1)
-Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.0)
-Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.9/site-packages (from dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.1)
-Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
-Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.7)
-Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.8)
-Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)
-Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)
-Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.4)
-Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from distributed~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.0)
-Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.9/site-packages (from GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.0.11)
-Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.19.1)
-Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.6)
-Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.43)
-Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.17.2)
-Requirement already satisfied: stack-data in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.6.3)
-Requirement already satisfied: traitlets>=5 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.14.1)
-Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9.0)
-Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from jinja2>=3.1.3,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.5)
-Requirement already satisfied: absl-py<2,>=0.9 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.0)
-Requirement already satisfied: kubernetes<26,>=8.0.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (25.3.0)
-Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.12.11)
-Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.10.1)
-Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.5)
-Requirement already satisfied: jsonschema<5,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.21.1)
-Requirement already satisfied: strip-hints<1,>=0.1.8 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.10)
-Requirement already satisfied: docstring-parser<1,>=0.7.3 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.15)
-Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.16 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.16)
-Requirement already satisfied: fire<1,>=0.3.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.0)
-Requirement already satisfied: uritemplate<4,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)
-Requirement already satisfied: typer<1.0,>=0.3.2 in /opt/conda/lib/python3.9/site-packages (from kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
-Requirement already satisfied: entrypoints<1 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4)
-Requirement already satisfied: pytz<2024 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.4)
-Requirement already satisfied: sqlparse<1,>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.4.4)
-Requirement already satisfied: alembic!=1.10.0,<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13.1)
-Requirement already satisfied: docker<8,>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.0.0)
-Requirement already satisfied: Flask<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
-Requirement already satisfied: querystring-parser<2 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.4)
-Requirement already satisfied: markdown<4,>=3.3 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.5.2)
-Requirement already satisfied: matplotlib<4 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.8.3)
-Requirement already satisfied: gunicorn<22 in /opt/conda/lib/python3.9/site-packages (from mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)
-Requirement already satisfied: requests-oauthlib>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.1)
-Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.2.2)
-Requirement already satisfied: nbconvert>=6.4.5 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.16.1)
-Requirement already satisfied: notebook<7.0.0,>=6.4 in /opt/conda/lib/python3.9/site-packages (from nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.5.6)
-Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.9/site-packages (from pandas<2.2,>=1.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2024.1)
-Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly<5.12.0,~=5.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (8.2.3)
-Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.9/site-packages (from requests~=2.31->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.4)
-Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.9/site-packages (from sqlalchemy~=1.4->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.3)
-Requirement already satisfied: nuclio-sdk>=0.5.3 in /opt/conda/lib/python3.9/site-packages (from storey~=1.6.18->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.9)
-Collecting networkx (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for networkx from https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl.metadata
-  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
-Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cuda-nvrtc-cu12==12.1.105 from https://files.pythonhosted.org/packages/b6/9f/c64c03f49d6fbc56196664d05dba14e3a561038a81a638eeb47f4d4cfd48/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cuda-runtime-cu12==12.1.105 from https://files.pythonhosted.org/packages/eb/d5/c68b1d2cdfcc59e72e8a5949a37ddb22ae6cade80cd4a57a84d4c8b55472/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cuda-cupti-cu12==12.1.105 from https://files.pythonhosted.org/packages/7e/00/6b218edd739ecfc60524e585ba8e6b00554dd908de2c9c66c1af3e44e18d/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cudnn-cu12==8.9.2.26 from https://files.pythonhosted.org/packages/ff/74/a2e2be7fb83aaedec84f391f082cf765dfb635e7caa9b49065f73e4835d8/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cublas-cu12==12.1.3.1 from https://files.pythonhosted.org/packages/37/6d/121efd7382d5b0284239f4ab1fc1590d86d34ed4a4a2fdb13b30ca8e5740/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cufft-cu12==11.0.2.54 from https://files.pythonhosted.org/packages/86/94/eb540db023ce1d162e7bea9f8f5aa781d57c65aed513c33ee9a5123ead4d/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-curand-cu12==10.3.2.106 from https://files.pythonhosted.org/packages/44/31/4890b1c9abc496303412947fc7dcea3d14861720642b49e8ceed89636705/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
-Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cusolver-cu12==11.4.5.107 from https://files.pythonhosted.org/packages/bc/1d/8de1e5c67099015c834315e333911273a8c6aaba78923dd1d1e25fc5f217/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-cusparse-cu12==12.1.0.106 from https://files.pythonhosted.org/packages/65/5b/cfaeebf25cd9fdec14338ccb16f6b2c4c7fa9163aefcf057d86b9cc248bb/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
-Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-nccl-cu12==2.19.3 from https://files.pythonhosted.org/packages/38/00/d0d4e48aef772ad5aebcf70b73028f88db6e5640b36c38e90445b7a57c45/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)
-Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-nvtx-cu12==12.1.105 from https://files.pythonhosted.org/packages/da/d3/8057f0587683ed2fcd4dbfbdfdfa807b9160b809976099d36b8f60d08f03/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata
-  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)
-Collecting triton==2.2.0 (from torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for triton==2.2.0 from https://files.pythonhosted.org/packages/6a/5c/01d9f062f719581cf6e60053e1a005d666ec67dcb59630fffaa3a3e5c9d8/triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
-Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.9->optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for nvidia-nvjitlink-cu12 from https://files.pythonhosted.org/packages/58/d1/d1c80553f9d5d07b6072bc132607d75a0ef3600e28e1890e11c0f55d7346/nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata
-  Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
-INFO: pip is looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.
-Collecting transformers[sentencepiece]>=4.26.0 (from optimum~=1.6.4->-r /empty/requirements.txt (line 4))
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/a4/73/f620d76193954e16db3d5c53a07d956d7b9c800e570758d3bff91906d4a4/transformers-4.39.0-py3-none-any.whl.metadata
-  Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 115.9 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata
-  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.7/130.7 kB 126.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3e/6b/1b589f7b69aaea8193cf5bc91cf97410284aecd97b6312cdb08baedbdffe/transformers-4.38.1-py3-none-any.whl.metadata
-  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 138.2 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/91/89/5416dc364c7ef0711c564fd61a69b03d1e40eeb5c506c38e53ba8a969e79/transformers-4.38.0-py3-none-any.whl.metadata
-  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.1/131.1 kB 186.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/85/f6/c5065913119c41ecad148c34e3a861f719e16b89a522287213698da911fc/transformers-4.37.2-py3-none-any.whl.metadata
-  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 236.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ad/67/b4d6a51dcaf988cb45b31e26c6e33fb169fe34ba5fb168b086309bd7c028/transformers-4.37.1-py3-none-any.whl.metadata
-  Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 156.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/3c/45/52133ce6bce49a099cc865599803bf1fad93de887276f728e56848d77a70/transformers-4.37.0-py3-none-any.whl.metadata
-  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 102.0 MB/s eta 0:00:00
-INFO: pip is still looking at multiple versions of transformers[sentencepiece] to determine which version is compatible with other requirements. This could take a while.
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata
-  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 108.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/fc/04/0aad491cd98b09236c54ab849863ee85421eeda5138bbf9d33ecc594652b/transformers-4.36.1-py3-none-any.whl.metadata
-  Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 140.6 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/0f/12/d8e27a190ca67811f81deea3183b528d9169f10b74d827e0b9211520ecfa/transformers-4.36.0-py3-none-any.whl.metadata
-  Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.8/126.8 kB 267.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
-  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 130.2 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/92/ba/cfff7e01f7070d9fca3964bf42b2257b86964c3e6763b8d5435436cc1d77/transformers-4.35.1-py3-none-any.whl.metadata
-  Downloading transformers-4.35.1-py3-none-any.whl.metadata (123 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 183.6 MB/s eta 0:00:00
-INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata
-  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.1/123.1 kB 177.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata
-  Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 270.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata
-  Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.5/121.5 kB 133.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata
-  Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 163.1 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata
-  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.9 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/13/30/54b59e73400df3de506ad8630284e9fd63f4b94f735423d55fc342181037/transformers-4.33.1-py3-none-any.whl.metadata
-  Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 274.2 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata
-  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 185.9 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl.metadata
-  Downloading transformers-4.32.1-py3-none-any.whl.metadata (118 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 144.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/ae/95/283a1c004430bd2a9425d6937fc545dd49a4e4592feb76be0299a14e2378/transformers-4.32.0-py3-none-any.whl.metadata
-  Downloading transformers-4.32.0-py3-none-any.whl.metadata (118 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.5/118.5 kB 150.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata
-  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.9/116.9 kB 156.7 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/5b/0b/e45d26ccd28568013523e04f325432ea88a442b4e3020b757cf4361f0120/transformers-4.30.2-py3-none-any.whl.metadata
-  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.7 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/b8/df/b01b5e67cde3883757c9212455cbb9169385dcab5858b7172199126b756d/transformers-4.30.1-py3-none-any.whl.metadata
-  Downloading transformers-4.30.1-py3-none-any.whl.metadata (113 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 263.8 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e2/72/1af3d38e98fdcceb3876de4567ac395a66c26976e259fe2d46266e052d61/transformers-4.30.0-py3-none-any.whl.metadata
-  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.6/113.6 kB 266.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/17/aa/a89864288afe45abe1ab79f002140a20348140e86836d96096d8f8a3bac0/transformers-4.29.2-py3-none-any.whl.metadata
-  Downloading transformers-4.29.2-py3-none-any.whl.metadata (112 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 272.7 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl.metadata
-  Downloading transformers-4.29.1-py3-none-any.whl.metadata (112 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.3/112.3 kB 262.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/45/e4/4914b11df70954d95a7c36b74bf9010c8594fcec960471479449b0deb4f7/transformers-4.29.0-py3-none-any.whl.metadata
-  Downloading transformers-4.29.0-py3-none-any.whl.metadata (111 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 111.9/111.9 kB 269.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl.metadata
-  Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 245.6 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/8b/13/1ce598763b3669d43f192a7911bf2bf730a328012ab8801b93187a4f70d0/transformers-4.28.0-py3-none-any.whl.metadata
-  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.0/110.0 kB 256.3 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/87/f0/2a152ed10ab8601431e87a606d397f7473c5fa4f8162f4ec5bda6ddb2df4/transformers-4.27.4-py3-none-any.whl.metadata
-  Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 254.4 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/52/ac/9dc5a17ba60bc354d99250d9d1629f99d76f6729cee438fa91c8cc74bc5d/transformers-4.27.3-py3-none-any.whl.metadata
-  Downloading transformers-4.27.3-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 251.5 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/73/f0/4a795505387a3e7cd7f0c2a2a87f876658f9a07947a38fb67bffceff9246/transformers-4.27.2-py3-none-any.whl.metadata
-  Downloading transformers-4.27.2-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 246.1 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/6d/9b/2f536f9e73390209e0b27b74691355dac494b7ec8154f3012fdc6debbae7/transformers-4.27.1-py3-none-any.whl.metadata
-  Downloading transformers-4.27.1-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 114.0 MB/s eta 0:00:00
-  Obtaining dependency information for transformers[sentencepiece]>=4.26.0 from https://files.pythonhosted.org/packages/4d/3e/1378ed266cf991f5ab5fcb29e953d97d793c7f9242ea5dc52f856415ea3a/transformers-4.27.0-py3-none-any.whl.metadata
-  Downloading transformers-4.27.0-py3-none-any.whl.metadata (106 kB)
-     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.7/106.7 kB 247.2 MB/s eta 0:00:00
-Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers~=4.26.1->-r /empty/requirements.txt (line 5))
-  Obtaining dependency information for sentencepiece!=0.1.92,>=0.1.91 from https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
-  Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
-Collecting protobuf>=3.20.2 (from onnx~=1.14.1->-r /empty/requirements.txt (line 2))
-  Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/38/b1/d9b615dceb67ac38e13cbd7680c27182b40154996022cbb244ba1ac7d30f/protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata
-  Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)
-Requirement already satisfied: future>=0.18.2 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
-Requirement already satisfied: ujson>=3 in /opt/conda/lib/python3.9/site-packages (from v3io~=0.5.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.0)
-Requirement already satisfied: googleapis-common-protos>=1.5.3 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.62.0)
-Requirement already satisfied: grpcio-tools!=1.34.0,<1.49,>=1.30 in /opt/conda/lib/python3.9/site-packages (from v3io-frames~=0.10.12->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)
-Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata
-  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
-INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
-Collecting multiprocess (from datasets~=2.10.1->-r /empty/requirements.txt (line 6))
-  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/c6/c9/820b5ab056f4ada76fbe05bd481a948f287957d6cbfd59e2dd2618b408c1/multiprocess-0.70.15-py39-none-any.whl.metadata
-  Downloading multiprocess-0.70.15-py39-none-any.whl.metadata (7.2 kB)
-  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/6a/f4/fbeb03ef7abdda54db4a6a75c971b88ab73d724ff09e3275cc1e99f1c946/multiprocess-0.70.14-py39-none-any.whl.metadata
-  Downloading multiprocess-0.70.14-py39-none-any.whl.metadata (6.6 kB)
-Collecting mpmath>=0.19 (from sympy->onnxruntime~=1.16.1->-r /empty/requirements.txt (line 3))
-  Obtaining dependency information for mpmath>=0.19 from https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl.metadata
-  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
-Requirement already satisfied: Mako in /opt/conda/lib/python3.9/site-packages (from alembic!=1.10.0,<2->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.2)
-Requirement already satisfied: cffi in /opt/conda/lib/python3.9/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.16.0)
-Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire<1,>=0.3.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.0)
-Requirement already satisfied: Werkzeug>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.1)
-Requirement already satisfied: itsdangerous>=2.1.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1.2)
-Requirement already satisfied: blinker>=1.6.2 in /opt/conda/lib/python3.9/site-packages (from Flask<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)
-Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.9/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.41,~=3.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.0.1)
-Requirement already satisfied: httplib2<1dev,>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.22.0)
-Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.9/site-packages (from google-api-python-client<2,>=1.7.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)
-Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.3.3)
-Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)
-Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.9)
-Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-bigquery-storage<3.0.0dev,>=2.6.0->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.23.0)
-Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /opt/conda/lib/python3.9/site-packages (from google-cloud-storage->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.0)
-Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata>=4.13.0->dask~=2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.17.0)
-Requirement already satisfied: parso<0.9.0,>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from jedi>=0.16->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.8.3)
-Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2023.12.1)
-Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.33.0)
-Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)
-Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.9/site-packages (from kubernetes<26,>=8.0.0->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.7.0)
-Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.0)
-Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.12.1)
-Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.49.0)
-Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.4.5)
-Requirement already satisfied: pillow>=8 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (10.2.0)
-Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.1.1)
-Requirement already satisfied: importlib-resources>=3.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib<4->mlflow~=2.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.2)
-Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /opt/conda/lib/python3.9/site-packages (from msal<2.0.0,>=1.24.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.0)
-Requirement already satisfied: portalocker<3,>=1.0 in /opt/conda/lib/python3.9/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity~=1.5->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.2)
-Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (4.12.3)
-Requirement already satisfied: bleach!=5.0.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.1.0)
-Requirement already satisfied: defusedxml in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.1)
-Requirement already satisfied: jupyter-core>=4.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.7.1)
-Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.3.0)
-Requirement already satisfied: mistune<4,>=2.0.3 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.0.2)
-Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
-Requirement already satisfied: nbformat>=5.7 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (5.9.2)
-Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)
-Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.9/site-packages (from nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.2.1)
-Requirement already satisfied: pyzmq<25,>=17 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (24.0.1)
-Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (23.1.0)
-Requirement already satisfied: jupyter-client<8,>=5.3.4 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.4.9)
-Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.0)
-Requirement already satisfied: ipykernel in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (6.29.3)
-Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.2)
-Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.18.0)
-Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.20.0)
-Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.9/site-packages (from notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.0.0)
-Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.7.0)
-Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.13)
-Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from requests-oauthlib>=0.5.0->msrest~=0.6.21->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.2.2)
-Requirement already satisfied: wheel in /opt/conda/lib/python3.9/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.41.2)
-Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.1)
-Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.4.1)
-Requirement already satisfied: pure-eval in /opt/conda/lib/python3.9/site-packages (from stack-data->ipython~=8.10->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.2)
-Requirement already satisfied: webencodings in /opt/conda/lib/python3.9/site-packages (from bleach!=5.0.0->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
-Requirement already satisfied: pycparser in /opt/conda/lib/python3.9/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs==2023.9.0->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.21)
-Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /opt/conda/lib/python3.9/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-bigquery[bqstorage,pandas]==3.14.1->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.48.2)
-Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.9/site-packages (from jupyter-core>=4.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (3.10.0)
-Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.12.5)
-Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.9/site-packages (from nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.4)
-Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.9/site-packages (from nbformat>=5.7->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.19.1)
-Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs==2023.9.2->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.1)
-Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.9/site-packages (from argon2-cffi->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (21.2.0)
-Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.9/site-packages (from beautifulsoup4->nbconvert>=6.4.5->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.5)
-Requirement already satisfied: comm>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.2.1)
-Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.9/site-packages (from ipykernel->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.8.1)
-Requirement already satisfied: jupyter-events>=0.9.0 in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.9.0)
-Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.5.2)
-Requirement already satisfied: overrides in /opt/conda/lib/python3.9/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (7.7.0)
-Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.0.7)
-Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.4)
-Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.9/site-packages (from jupyter-events>=0.9.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook<7.0.0,>=6.4->nuclio-jupyter~=0.9.15->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (0.1.1)
-Requirement already satisfied: fqdn in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.5.1)
-Requirement already satisfied: isoduration in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (20.11.0)
-Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.1)
-Requirement already satisfied: uri-template in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)
-Requirement already satisfied: webcolors>=1.11 in /opt/conda/lib/python3.9/site-packages (from jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.13)
-Requirement already satisfied: arrow>=0.15.0 in /opt/conda/lib/python3.9/site-packages (from isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (1.3.0)
-Requirement already satisfied: types-python-dateutil>=2.8.10 in /opt/conda/lib/python3.9/site-packages (from arrow>=0.15.0->isoduration->jsonschema<5,>=3.0.1->kfp~=1.8->mlrun[complete]==1.6.1->-r /empty/requirements.txt (line 1)) (2.8.19.20240106)
-Downloading onnx-1.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 274.2 MB/s eta 0:00:00
-Downloading onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 277.9 MB/s eta 0:00:00
-Downloading optimum-1.6.4-py3-none-any.whl (227 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.8/227.8 kB 291.3 MB/s eta 0:00:00
-Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 242.4 MB/s eta 0:00:00
-Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 469.0/469.0 kB 185.9 MB/s eta 0:00:00
-Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.4/26.4 MB 275.9 MB/s eta 0:00:00
-Downloading dill-0.3.6-py3-none-any.whl (110 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 282.3 MB/s eta 0:00:00
-Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 346.4/346.4 kB 311.7 MB/s eta 0:00:00
-Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 269.6 MB/s eta 0:00:00
-Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 773.4/773.4 kB 311.9 MB/s eta 0:00:00
-Downloading responses-0.18.0-py3-none-any.whl (38 kB)
-Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 264.1 MB/s eta 0:00:00
-Downloading torch-2.2.1-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.5/755.5 MB 204.0 MB/s eta 0:00:00
-Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 40.3 MB/s eta 0:00:00
-Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 43.0 MB/s eta 0:00:00
-Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 46.9 MB/s eta 0:00:00
-Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 51.0 MB/s eta 0:00:00
-Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 731.7/731.7 MB 58.2 MB/s eta 0:00:00
-Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 69.0 MB/s eta 0:00:00
-Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 36.0 MB/s eta 0:00:00
-Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 52.8 MB/s eta 0:00:00
-Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 45.9 MB/s eta 0:00:00
-Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.0/166.0 MB 19.6 MB/s eta 0:00:00
-Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 kB 27.7 MB/s eta 0:00:00
-Downloading triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.9/167.9 MB 41.3 MB/s eta 0:00:00
-Downloading protobuf-3.20.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 42.8 MB/s eta 0:00:00
-Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.0/46.0 kB 192.0 MB/s eta 0:00:00
-Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
-Downloading flatbuffers-24.3.7-py2.py3-none-any.whl (26 kB)
-Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 kB 100.7 MB/s eta 0:00:00
-Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 41.4 MB/s eta 0:00:00
-Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 253.7 MB/s eta 0:00:00
-Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 45.4 MB/s eta 0:00:00
-Downloading sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 46.1 MB/s eta 0:00:00
-Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 43.7 MB/s eta 0:00:00
-Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 43.8 MB/s eta 0:00:00
-Installing collected packages: tokenizers, sentencepiece, mpmath, flatbuffers, sympy, regex, protobuf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, humanfriendly, filelock, dill, triton, responses, onnx, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, huggingface-hub, coloredlogs, transformers, scikit-learn, onnxruntime, nvidia-cusolver-cu12, torch, datasets, optimum
-  Attempting uninstall: protobuf
-    Found existing installation: protobuf 3.20.3
-    Uninstalling protobuf-3.20.3:
-      Successfully uninstalled protobuf-3.20.3
-  Attempting uninstall: numpy
-    Found existing installation: numpy 1.26.4
-    Uninstalling numpy-1.26.4:
-      Successfully uninstalled numpy-1.26.4
-  Attempting uninstall: scikit-learn
-    Found existing installation: scikit-learn 1.4.1.post1
-    Uninstalling scikit-learn-1.4.1.post1:
-      Successfully uninstalled scikit-learn-1.4.1.post1
-Successfully installed coloredlogs-15.0.1 datasets-2.10.1 dill-0.3.6 filelock-3.13.1 flatbuffers-24.3.7 huggingface-hub-0.21.4 humanfriendly-10.0 mpmath-1.3.0 multiprocess-0.70.14 networkx-3.2.1 numpy-1.23.5 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 onnx-1.14.1 onnxruntime-1.16.3 optimum-1.6.4 protobuf-3.20.2 regex-2023.12.25 responses-0.18.0 scikit-learn-1.0.2 sentencepiece-0.2.0 sympy-1.12 tokenizers-0.13.3 torch-2.2.1 transformers-4.26.1 triton-2.2.0
-WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
-INFO[0238] Taking snapshot of full filesystem...        
-INFO[0463] Pushing image to docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest 
-INFO[0493] Pushed docker-registry.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer@sha256:691d0bb3c23487b4b5d2f84ab323c24735626ee81681475f53a4158b72d4cfee 
-
-
-
BuildStatus(ready=True, outputs={'image': '.mlrun/func-hugging-face-trainer-avia-hugging-face-classifier-trainer:latest'})
-
-
-
-
-
-
-
train_run = hugging_face_classifier_trainer.run(params={
-                                                        "hf_dataset": "Shayanvsf/US_Airline_Sentiment",
-                                                        "drop_columns": [
-                                                            "airline_sentiment_confidence",
-                                                            "negativereason_confidence",
-                                                        ],
-                                                        "pretrained_tokenizer": "distilbert-base-uncased",
-                                                        "pretrained_model": "distilbert-base-uncased",
-                                                        "model_class": "transformers.AutoModelForSequenceClassification",
-                                                        "label_name": "airline_sentiment",
-                                                        "num_of_train_samples": 100,
-                                                        "metrics": ["accuracy", "f1"],
-                                                        "random_state": 42,
-                                                        **additional_parameters
-                                                    },
-                                                    handler="train",                                                    
-                                                )
-
-
-
-
-
> 2024-03-24 17:22:42,252 [info] Storing function: {'name': 'hugging-face-classifier-trainer-train', 'uid': '53252ce7aacb4b1aacf86bf3b862daa2', 'db': 'http://mlrun-api:8080'}
-> 2024-03-24 17:22:42,536 [info] Job is running in the background, pod: hugging-face-classifier-trainer-train-dqqfr
-> 2024-03-24 17:24:43,288 [info] 'train_test_split_size' is not provided, setting train_test_split_size to 0.2
-> 2024-03-24 17:24:43,847 [info] Loading and editing Shayanvsf/US_Airline_Sentiment dataset from Hugging Face hub
-Downloading metadata: 100%|██████████| 1.03k/1.03k [00:00<00:00, 6.77MB/s]
-Downloading and preparing dataset None/None (download: 265.13 KiB, generated: 1.50 MiB, post-processed: Unknown size, total: 1.76 MiB) to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...
-Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]
-Downloading data: 100%|██████████| 92.6k/92.6k [00:00<00:00, 59.3MB/s]
-Downloading data files:  33%|███▎      | 1/3 [00:00<00:00,  6.42it/s]
-Downloading data: 100%|██████████| 605k/605k [00:00<00:00, 81.8MB/s]
-Downloading data files:  67%|██████▋   | 2/3 [00:00<00:00,  6.59it/s]
-Downloading data: 100%|██████████| 179k/179k [00:00<00:00, 50.9MB/s]
-Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  6.62it/s]
-Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1263.34it/s]
-Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Shayanvsf___parquet/Shayanvsf--US_Airline_Sentiment-1319c42f87c44b2f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.
-100%|██████████| 3/3 [00:00<00:00, 978.99it/s]                              
-Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
-- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
-- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
-Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
-You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
-> 2024-03-24 17:24:47,076 [info] training 'huggingface-model'
-The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
-***** Running training *****
-  Num examples = 100
-  Num Epochs = 3
-  Instantaneous batch size per device = 16
-  Total train batch size (w. parallel, distributed & accumulation) = 16
-  Gradient Accumulation steps = 1
-  Total optimization steps = 21
-  Number of trainable parameters = 66955010
-huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
-To disable this warning, you can either:
-	- Avoid using `tokenizers` before the fork if possible
-	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
-  0%|          | 0/21 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
- 33%|███▎      | 7/21 [00:16<00:28,  2.02s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-
-{'loss': 0.7005, 'learning_rate': 1.904761904761905e-05, 'epoch': 0.14}
-{'loss': 0.6528, 'learning_rate': 1.8095238095238097e-05, 'epoch': 0.29}
-{'loss': 0.6468, 'learning_rate': 1.7142857142857142e-05, 'epoch': 0.43}
-{'loss': 0.5877, 'learning_rate': 1.6190476190476193e-05, 'epoch': 0.57}
-{'loss': 0.6694, 'learning_rate': 1.523809523809524e-05, 'epoch': 0.71}
-{'loss': 0.5219, 'learning_rate': 1.4285714285714287e-05, 'epoch': 0.86}
-{'loss': 0.7052, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}
-  0%|          | 0/2 [00:00<?, ?it/s]
-100%|██████████| 2/2 [00:00<00:00,  4.86it/s]main.py:561: FutureWarning:
-
-load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
-
-
-
-Downloading builder script: 4.21kB [00:00, 11.4MB/s]                   
-
-
-Downloading builder script: 6.50kB [00:00, 21.8MB/s]                   
-                                              
- 33%|███▎      | 7/21 [00:18<00:28,  2.02s/it]
-100%|██████████| 2/2 [00:00<00:00,  4.86it/s]
- 67%|██████▋   | 14/21 [00:34<00:14,  2.07s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-{'eval_loss': 0.5350419878959656, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.5536, 'eval_samples_per_second': 15.448, 'eval_steps_per_second': 1.287, 'epoch': 1.0}
-{'loss': 0.5942, 'learning_rate': 1.2380952380952383e-05, 'epoch': 1.14}
-{'loss': 0.5899, 'learning_rate': 1.1428571428571429e-05, 'epoch': 1.29}
-{'loss': 0.5317, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}
-{'loss': 0.4516, 'learning_rate': 9.523809523809525e-06, 'epoch': 1.57}
-{'loss': 0.5121, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.71}
-{'loss': 0.5264, 'learning_rate': 7.61904761904762e-06, 'epoch': 1.86}
-{'loss': 0.539, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}
-
-  0%|          | 0/2 [00:00<?, ?it/s]
-                                               A
- 67%|██████▋   | 14/21 [00:35<00:14,  2.07s/it]
-100%|██████████| 2/2 [00:00<00:00,  4.95it/s]
-100%|██████████| 21/21 [00:52<00:00,  2.05s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
-***** Running Evaluation *****
-  Num examples = 24
-  Batch size = 16
-{'eval_loss': 0.4877033233642578, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.1789, 'eval_samples_per_second': 20.357, 'eval_steps_per_second': 1.696, 'epoch': 2.0}
-{'loss': 0.4059, 'learning_rate': 5.7142857142857145e-06, 'epoch': 2.14}
-{'loss': 0.5851, 'learning_rate': 4.761904761904762e-06, 'epoch': 2.29}
-{'loss': 0.4135, 'learning_rate': 3.80952380952381e-06, 'epoch': 2.43}
-{'loss': 0.6571, 'learning_rate': 2.8571428571428573e-06, 'epoch': 2.57}
-{'loss': 0.4883, 'learning_rate': 1.904761904761905e-06, 'epoch': 2.71}
-{'loss': 0.5114, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}
-{'loss': 0.5215, 'learning_rate': 0.0, 'epoch': 3.0}
-
-  0%|          | 0/2 [00:00<?, ?it/s]
-                                               A
-100%|██████████| 21/21 [00:54<00:00,  2.05s/it]
-100%|██████████| 2/2 [00:00<00:00,  6.38it/s]
-                                             
-
-Training completed. Do not forget to share your model on huggingface.co/models =)
-
-
-100%|██████████| 21/21 [00:55<00:00,  2.62s/it]
-tokenizer config file saved in /tmp/tokenizer/tokenizer_config.json
-Special tokens file saved in /tmp/tokenizer/special_tokens_map.json
-Configuration saved in /tmp/model/config.json
-Model weights saved in /tmp/model/pytorch_model.bin
-{'eval_loss': 0.4750453531742096, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.0, 'eval_runtime': 1.0524, 'eval_samples_per_second': 22.806, 'eval_steps_per_second': 1.9, 'epoch': 3.0}
-{'train_runtime': 55.1543, 'train_samples_per_second': 5.439, 'train_steps_per_second': 0.381, 'train_loss': 0.5624780683290391, 'epoch': 3.0}
-> 2024-03-24 17:26:00,230 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia', 'logs_cmd': 'mlrun logs 53252ce7aacb4b1aacf86bf3b862daa2 -p hugging-face-trainer-avia'}
-> 2024-03-24 17:26:00,231 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/hugging-face-trainer-avia/jobs/monitor/53252ce7aacb4b1aacf86bf3b862daa2/overview'}
-> 2024-03-24 17:26:00,231 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
hugging-face-trainer-avia0Mar 24 17:24:39completedhugging-face-classifier-trainer-train
v3io_user=avia
kind=job
owner=avia
mlrun/client_version=1.6.1
mlrun/client_python_version=3.9.16
host=hugging-face-classifier-trainer-train-dqqfr
hf_dataset=Shayanvsf/US_Airline_Sentiment
drop_columns=['airline_sentiment_confidence', 'negativereason_confidence']
pretrained_tokenizer=distilbert-base-uncased
pretrained_model=distilbert-base-uncased
model_class=transformers.AutoModelForSequenceClassification
label_name=airline_sentiment
num_of_train_samples=100
metrics=['accuracy', 'f1']
random_state=42
TRAIN_output_dir=finetuning-sentiment-model-3000-samples
TRAIN_learning_rate=2e-05
TRAIN_per_device_train_batch_size=16
TRAIN_per_device_eval_batch_size=16
TRAIN_num_train_epochs=3
TRAIN_weight_decay=0.01
TRAIN_push_to_hub=False
TRAIN_evaluation_strategy=epoch
TRAIN_eval_steps=1
TRAIN_logging_steps=1
CLASS_num_labels=2
loss=0.5215
learning_rate=0.0
eval_loss=0.4750453531742096
eval_accuracy=0.7916666666666666
eval_f1=0.0
eval_runtime=1.0524
eval_samples_per_second=22.806
eval_steps_per_second=1.9
train_runtime=55.1543
train_samples_per_second=5.439
train_steps_per_second=0.381
total_flos=3327208489680.0
loss_plot
learning_rate_plot
eval_loss_plot
eval_accuracy_plot
eval_f1_plot
eval_runtime_plot
eval_samples_per_second_plot
eval_steps_per_second_plot
tokenizer
model
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-24 17:26:09,792 [info] Run execution finished: {'status': 'completed', 'name': 'hugging-face-classifier-trainer-train'}
-
-
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/static/function.html b/functions/development/hugging_face_classifier_trainer/latest/static/function.html deleted file mode 100644 index 2bf1ffb9..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/static/function.html +++ /dev/null @@ -1,390 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: hugging-face-classifier-trainer
-  tag: ''
-  hash: e8113e81f04c96fc9a8a94e717dea81ee3e05a18
-  project: ''
-  labels:
-    author: davids
-  categories:
-  - machine-learning
-  - model-training
-spec:
-  command: ''
-  args: []
-  image: ''
-  build:
-    functionSourceCode: 
-    base_image: mlrun/mlrun
-    commands: []
-    code_origin: ''
-    origin_filename: ''
-    requirements:
-    - onnx~=1.14.1
-    - onnxruntime~=1.16.1
-    - optimum~=1.6.4
-    - transformers~=4.26.1
-    - datasets~=2.10.1
-    - scikit-learn~=1.0.2
-  entry_points:
-    add_interface:
-      name: add_interface
-      doc: 'Enrich the object with this interface properties, methods and functions,
-        so it will have this TensorFlow.Keras
-
-        MLRuns features.'
-      parameters:
-      - name: cls
-      - name: obj
-        type: Trainer
-        doc: The object to enrich his interface.
-      - name: restoration
-        type: MLRunInterfaceRestorationType
-        doc: Restoration information tuple as returned from 'remove_interface' in
-          order to add the interface in a certain state.
-        default: null
-      outputs: []
-      lineno: 146
-      has_varargs: false
-      has_kwargs: false
-    mlrun_optimize:
-      name: mlrun_optimize
-      doc: 'MLRun''s tf.keras.Model.fit wrapper. It will setup the optimizer when
-        using horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-      outputs: []
-      lineno: 79
-      has_varargs: false
-      has_kwargs: false
-    wrapper:
-      name: wrapper
-      doc: ''
-      parameters:
-      - name: self
-        type: Trainer
-      outputs: []
-      lineno: 173
-      has_varargs: true
-      has_kwargs: true
-    enable_auto_logging:
-      name: enable_auto_logging
-      doc: ''
-      parameters:
-      - name: self
-      - name: context
-        type: MLClientCtx
-      - name: model_name
-        type: str
-        default: model
-      - name: tag
-        type: str
-        default: ''
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs: []
-      lineno: 114
-      has_varargs: false
-      has_kwargs: false
-    mlrun_train:
-      name: mlrun_train
-      doc: 'MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using
-        horovod. The optimizer must be
-
-        passed in a keyword argument and when using horovod, it must be passed as
-        an Optimizer instance, not a string.
-
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow
-        the instructions above.'
-      parameters:
-      - name: cls
-      outputs: []
-      lineno: 164
-      has_varargs: false
-      has_kwargs: false
-    on_epoch_begin:
-      name: on_epoch_begin
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 220
-      has_varargs: false
-      has_kwargs: true
-    on_epoch_end:
-      name: on_epoch_end
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 229
-      has_varargs: false
-      has_kwargs: true
-    on_log:
-      name: on_log
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      - name: logs
-        type: Dict[str, float]
-        default: null
-      outputs: []
-      lineno: 238
-      has_varargs: false
-      has_kwargs: true
-    on_train_begin:
-      name: on_train_begin
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 262
-      has_varargs: false
-      has_kwargs: true
-    on_train_end:
-      name: on_train_end
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      - name: model
-        type: PreTrainedModel
-        default: null
-      - name: tokenizer
-        type: PreTrainedTokenizer
-        default: null
-      outputs: []
-      lineno: 271
-      has_varargs: false
-      has_kwargs: true
-    on_evaluate:
-      name: on_evaluate
-      doc: ''
-      parameters:
-      - name: self
-      - name: args
-        type: TrainingArguments
-      - name: state
-        type: TrainerState
-      - name: control
-        type: TrainerControl
-      outputs: []
-      lineno: 322
-      has_varargs: false
-      has_kwargs: true
-    apply_mlrun:
-      name: apply_mlrun
-      doc: Wrap the given model with MLRun's interface providing it with mlrun's additional
-        features.
-      parameters:
-      - name: huggingface_object
-        doc: The model to wrap. Can be loaded from the model path given as well.
-      - name: model_name
-        type: str
-        doc: 'The model name to use for storing the model artifact. Default: "model".'
-        default: null
-      - name: tag
-        type: str
-        doc: The model's tag to log with.
-        default: ''
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context to work with. If no context is given it will be retrieved
-          via 'mlrun.get_or_create_ctx(None)'
-        default: null
-      - name: auto_log
-        type: bool
-        doc: 'Whether to enable MLRun''s auto logging. Default: True.'
-        default: true
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs: []
-      lineno: 421
-      has_varargs: false
-      has_kwargs: true
-    train:
-      name: train
-      doc: 'Training and evaluating a pretrained model with a pretrained tokenizer
-        over a dataset.
-
-        The dataset can be either be the name of the dataset that contains in the
-        HuggingFace hub,
-
-        or a URI or a FeatureVector'
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-      - name: hf_dataset
-        type: str
-        doc: The name of the dataset to get from the HuggingFace hub
-        default: null
-      - name: dataset
-        type: DataItem
-        doc: The dataset to train the model on. Can be either a URI or a FeatureVector
-        default: null
-      - name: test_set
-        type: DataItem
-        doc: The test set to train the model with.
-        default: null
-      - name: drop_columns
-        type: Optional[List[str]]
-        doc: The columns to drop from the dataset.
-        default: null
-      - name: pretrained_tokenizer
-        type: str
-        doc: The name of the pretrained tokenizer from the HuggingFace hub.
-        default: null
-      - name: pretrained_model
-        type: str
-        doc: The name of the pretrained model from the HuggingFace hub.
-        default: null
-      - name: model_class
-        type: str
-        doc: The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-        default: null
-      - name: model_name
-        type: str
-        doc: The model's name to use for storing the model artifact, default to 'model'
-        default: huggingface-model
-      - name: label_name
-        type: str
-        doc: The target label of the column in the dataset.
-        default: labels
-      - name: text_col
-        type: str
-        doc: The input text column un the dataset.
-        default: text
-      - name: num_of_train_samples
-        type: int
-        doc: Max number of training samples, for debugging.
-        default: null
-      - name: train_test_split_size
-        type: float
-        doc: Should be between 0.0 and 1.0 and represent the proportion of the dataset
-          to include in the test split.
-        default: null
-      - name: metrics
-        type: List[str]
-        doc: List of different metrics for evaluate the model such as f1, accuracy
-          etc.
-        default: null
-      - name: random_state
-        type: int
-        doc: Random state for train_test_split
-        default: null
-      outputs: []
-      lineno: 647
-      has_varargs: false
-      has_kwargs: false
-    preprocess_function:
-      name: preprocess_function
-      doc: ''
-      parameters:
-      - name: examples
-      outputs: []
-      lineno: 696
-      has_varargs: false
-      has_kwargs: false
-    optimize:
-      name: optimize
-      doc: Optimizing the transformer model using ONNX optimization.
-      parameters:
-      - name: model_path
-        type: str
-        doc: The path of the model to optimize.
-      - name: model_name
-        type: str
-        doc: Name of the optimized model.
-        default: optimized_model
-      - name: target_dir
-        type: str
-        doc: The directory to save the ONNX model.
-        default: ./optimized
-      - name: optimization_level
-        type: int
-        doc: Optimization level performed by ONNX Runtime of the loaded graph. (default
-          is 1)
-        default: 1
-      outputs: []
-      lineno: 799
-      has_varargs: false
-      has_kwargs: false
-  description: Automatic train and optimize functions for HuggingFace framework
-  default_handler: train
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/static/hugging_face_classifier_trainer.html b/functions/development/hugging_face_classifier_trainer/latest/static/hugging_face_classifier_trainer.html deleted file mode 100644 index 99a105cb..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/static/hugging_face_classifier_trainer.html +++ /dev/null @@ -1,972 +0,0 @@ - - - - - - - -hugging_face_classifier_trainer.hugging_face_classifier_trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for hugging_face_classifier_trainer.hugging_face_classifier_trainer

-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import mlrun.datastore
-import mlrun.utils
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-
[docs]class HFORTOptimizerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRun's context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to be inserted so the MLRun interface will be fully enabled. - _PROPERTIES = { - "_auto_log": False, - "_context": None, - "_model_name": "model", - "_tag": "", - "_labels": None, - "_extra_data": None, - } - _METHODS = ["enable_auto_logging"] - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "optimize", - ] - -
[docs] @classmethod - def add_interface( - cls, - obj, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRun's features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - super(HFORTOptimizerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_optimize(cls): - """ - MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self, *args, **kwargs): - save_dir = cls._get_function_argument( - self.optimize, - argument_name="save_dir", - passed_args=args, - passed_kwargs=kwargs, - )[0] - - # Call the original optimize method: - result = self.original_optimize(*args, **kwargs) - - if self._auto_log: - # Log the onnx model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file=f"{save_dir}/model_optimized.onnx", - tag=self._tag, - framework="ONNX", - labels=self._labels, - extra_data=self._extra_data, - ) - - return result - - return wrapper
- -
[docs] def enable_auto_logging( - self, - context: mlrun.MLClientCtx, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - self._auto_log = True - - self._context = context - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data
- - -
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - -
[docs] @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - """ - Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras - MLRuns features. - :param obj: The object to enrich his interface. - :param restoration: Restoration information tuple as returned from 'remove_interface' in order to - add the interface in a certain state. - """ - - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_train(cls): - - """ - MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be - passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string. - - raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above. - """ - - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper
- - -
[docs]class MLRunCallback(TrainerCallback): - """ - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - -
[docs] def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._steps.append([])
- -
[docs] def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics()
- -
[docs] def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score)
- -
[docs] def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._is_training = True
- -
[docs] def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - self._log_metrics() - - temp_directory = tempfile.gettempdir() - - # Save and log the tokenizer: - if tokenizer is not None: - # Save tokenizer: - tokenizer_dir = os.path.join(temp_directory, "tokenizer") - tokenizer.save_pretrained(save_directory=tokenizer_dir) - # Zip the tokenizer directory: - tokenizer_zip = shutil.make_archive( - base_name="tokenizer", - format="zip", - root_dir=tokenizer_dir, - ) - # Log the zip file: - self._artifacts["tokenizer"] = self._context.log_artifact( - item="tokenizer", local_path=tokenizer_zip - ) - - # Save the model: - model_dir = os.path.join(temp_directory, "model") - model.save_pretrained(save_directory=model_dir) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=model_dir, - ) - - # Log the model: - self._context.log_model( - key="model", - db_key=self._model_name, - model_file="model.zip", - tag=self._tag, - framework="Hugging Face", - labels=self._labels, - extra_data={**self._artifacts, **self._extra_data}, - )
- -
[docs] def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - self._log_metrics() - - if self._is_training: - return
- - # TODO: Update the model object - - def _log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self._log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def _log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact)
- - -def _apply_mlrun_on_trainer( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -def _apply_mlrun_on_optimizer( - optimizer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx( - HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME - ) - - HFORTOptimizerMLRunInterface.add_interface(obj=optimizer) - - if auto_log: - optimizer.enable_auto_logging( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - - -
[docs]def apply_mlrun( - huggingface_object, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - Wrap the given model with MLRun's interface providing it with mlrun's additional features. - :param huggingface_object: The model to wrap. Can be loaded from the model path given as well. - :param model_name: The model name to use for storing the model artifact. Default: "model". - :param tag: The model's tag to log with. - :param context: MLRun context to work with. If no context is given it will be retrieved via - 'mlrun.get_or_create_ctx(None)' - :param auto_log: Whether to enable MLRun's auto logging. Default: True. - """ - - if isinstance(huggingface_object, transformers.Trainer): - return _apply_mlrun_on_trainer( - trainer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - import optimum.onnxruntime as optimum_ort - - if isinstance(huggingface_object, optimum_ort.ORTOptimizer): - return _apply_mlrun_on_optimizer( - optimizer=huggingface_object, - model_name=model_name, - tag=tag, - context=context, - auto_log=auto_log, - labels=labels, - extra_data=extra_data, - ) - raise mlrun.errors.MLRunInvalidArgumentError
- - -# ---------------------- from auto_trainer-------------------------------- -
[docs]class KWArgsPrefixes: - MODEL_CLASS = "CLASS_" - FIT = "FIT_" - TRAIN = "TRAIN_" - PREDICT = "PREDICT_"
- - -def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]: - """ - Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these - keys. - - :param src: The source dict to extract the values from. - :param prefix_key: Only keys with this prefix will be returned. The keys in the result dict will be without this - prefix. - """ - return { - key.replace(prefix_key, ""): val - for key, val in src.items() - if key.startswith(prefix_key) - } - - -def _get_dataframe( - context: MLClientCtx, - dataset: DataItem, - label_columns: Optional[Union[str, List[str]]] = None, - drop_columns: Union[str, List[str], int, List[int]] = None, -) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]: - """ - Getting the DataFrame of the dataset and drop the columns accordingly. - - :param context: MLRun context. - :param dataset: The dataset to train the model on. - Can be either a list of lists, dict, URI or a FeatureVector. - :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or - Classification tasks. - :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. 
- """ - if isinstance(dataset, (list, dict)): - dataset = pd.DataFrame(dataset) - # Checking if drop_columns provided by integer type: - if drop_columns: - if isinstance(drop_columns, str) or ( - isinstance(drop_columns, list) - and any(isinstance(col, str) for col in drop_columns) - ): - context.logger.error( - "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset" - ) - raise ValueError - dataset.drop(drop_columns, axis=1, inplace=True) - - return dataset, label_columns - - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - # feature-vector case: - label_columns = label_columns or dataset.meta.status.label_column - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - context.logger.info(f"label columns: {label_columns}") - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: - context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" - ) - return dataset, label_columns - - -# ---------------------- Hugging Face Trainer -------------------------------- - - -def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]: - """ - This function create and returns a function that will be used to compute metrics at evaluation. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. - - :returns: Function that will be used to compute metrics at evaluation. - Must take a [`EvalPrediction`] and return a dictionary string to metric values. 
- """ - - def _compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - metric_dict_results = {} - for metric in metrics: - load_met = load_metric(metric) - metric_res = load_met.compute(predictions=predictions, references=labels)[ - metric - ] - metric_dict_results[metric] = metric_res - - return metric_dict_results - - return _compute_metrics - - -def _edit_columns( - dataset: Dataset, - drop_columns: List[str] = None, - rename_columns: [str, str] = None, -) -> Dataset: - """ - Drop and renames that columns of the given dataset - :param dataset: Dataset to process - :param drop_columns: The columns to drop from the dataset. - :param rename_columns: Dict of columns ro rename : {<old_name>: <new_name>, ...} - - :returns: The dataset after the desired process - """ - if drop_columns: - dataset = dataset.remove_columns(drop_columns) - if rename_columns: - dataset = dataset.rename_columns(rename_columns) - return dataset - - -def _prepare_dataset( - context: MLClientCtx, - dataset_name: str, - label_name: str = None, - drop_columns: Optional[List[str]] = None, - num_of_train_samples: int = None, - train_test_split_size: float = None, - random_state: int = None, -) -> Tuple[Dataset, Dataset]: - """ - Loading the dataset and editing the columns - - :param context: MLRun contex - :param dataset_name: The name of the dataset to get from the HuggingFace hub - :param label_name: The target label of the column in the dataset. - :param drop_columns: The columns to drop from the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. 
- :param random_state: Random state for train_test_split - - """ - - context.logger.info( - f"Loading and editing {dataset_name} dataset from Hugging Face hub" - ) - rename_cols = {label_name: "labels"} - - # Loading and editing dataset: - dataset = load_dataset(dataset_name) - - # train set - train_dataset = dataset["train"] - if num_of_train_samples: - train_dataset = train_dataset.shuffle(seed=random_state).select( - list(range(num_of_train_samples)) - ) - train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols) - - # test set - test_dataset = dataset["test"] - if train_test_split_size or num_of_train_samples: - train_test_split_size = train_test_split_size or 0.2 - num_of_test_samples = int( - (train_dataset.num_rows * train_test_split_size) - // (1 - train_test_split_size) - ) - test_dataset = test_dataset.shuffle(seed=random_state).select( - list(range(num_of_test_samples)) - ) - test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols) - - return train_dataset, test_dataset - - -
[docs]def train( - context: MLClientCtx, - hf_dataset: str = None, - dataset: DataItem = None, - test_set: DataItem = None, - drop_columns: Optional[List[str]] = None, - pretrained_tokenizer: str = None, - pretrained_model: str = None, - model_class: str = None, - model_name: str = "huggingface-model", - label_name: str = "labels", - text_col: str = "text", - num_of_train_samples: int = None, - train_test_split_size: float = None, - metrics: List[str] = None, - random_state: int = None, -): - """ - Training and evaluating a pretrained model with a pretrained tokenizer over a dataset. - The dataset can be either be the name of the dataset that contains in the HuggingFace hub, - or a URI or a FeatureVector - - :param context: MLRun context - :param hf_dataset: The name of the dataset to get from the HuggingFace hub - :param dataset: The dataset to train the model on. Can be either a URI or a FeatureVector - :param test_set: The test set to train the model with. - :param drop_columns: The columns to drop from the dataset. - :param pretrained_tokenizer: The name of the pretrained tokenizer from the HuggingFace hub. - :param pretrained_model: The name of the pretrained model from the HuggingFace hub. - :param model_name: The model's name to use for storing the model artifact, default to 'model' - :param model_class: The class of the model, e.g. `transformers.AutoModelForSequenceClassification` - :param label_name: The target label of the column in the dataset. - :param text_col: The input text column un the dataset. - :param num_of_train_samples: Max number of training samples, for debugging. - :param train_test_split_size: Should be between 0.0 and 1.0 and represent the proportion of the dataset to include - in the test split. - :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc. 
- :param random_state: Random state for train_test_split - """ - - if train_test_split_size is None and test_set is None: - context.logger.info( - "'train_test_split_size' is not provided, setting train_test_split_size to 0.2" - ) - train_test_split_size = 0.2 - - # Creating tokenizer: - tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer) - - def preprocess_function(examples): - return tokenizer(examples[text_col], truncation=True) - - # prepare data for training - if hf_dataset: - train_dataset, test_dataset = _prepare_dataset( - context, - hf_dataset, - label_name, - drop_columns, - num_of_train_samples, - train_test_split_size, - random_state=random_state, - ) - elif dataset: - # Get DataFrame by URL or by FeatureVector: - train_dataset, label_name = _get_dataframe( - context=context, - dataset=dataset, - label_columns=label_name, - drop_columns=drop_columns, - ) - if test_set: - test_dataset, _ = _get_dataframe( - context=context, - dataset=test_set, - label_columns=label_name, - drop_columns=drop_columns, - ) - else: - train_dataset, test_dataset = train_test_split( - train_dataset, - test_size=train_test_split_size, - random_state=random_state, - ) - train_dataset = Dataset.from_pandas(train_dataset) - test_dataset = Dataset.from_pandas(test_dataset) - else: - raise mlrun.errors.MLRunInvalidArgumentError( - "Training data was not provided. A training dataset is mandatory for training." - " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'." 
- ) - - # Mapping datasets with the tokenizer: - tokenized_train = train_dataset.map(preprocess_function, batched=True) - tokenized_test = test_dataset.map(preprocess_function, batched=True) - - # Creating data collator for batching: - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Parsing kwargs: - train_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN - ) - model_class_kwargs = _get_sub_dict_by_prefix( - src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS - ) - - # Loading our pretrained model: - model_class_kwargs["pretrained_model_name_or_path"] = ( - model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model - ) - train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer - if not model_class_kwargs["pretrained_model_name_or_path"]: - raise mlrun.errors.MLRunRuntimeError( - "Must provide pretrained_model name as " - "function argument or in extra params" - ) - model = create_class(model_class).from_pretrained(**model_class_kwargs) - - # Preparing training arguments: - training_args = TrainingArguments( - **train_kwargs, - ) - - compute_metrics = _create_compute_metrics(metrics) if metrics else None - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics, - ) - - apply_mlrun(trainer, model_name=model_name) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train()
- - -def _get_model_dir(model_uri: str): - model_file, _, _ = mlrun.artifacts.get_model(model_uri) - model_dir = tempfile.gettempdir() - # Unzip the Model: - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_dir) - - return model_dir - - -
[docs]def optimize( - model_path: str, - model_name: str = "optimized_model", - target_dir: str = "./optimized", - optimization_level: int = 1, -): - """ - Optimizing the transformer model using ONNX optimization. - - - :param model_path: The path of the model to optimize. - :param model_name: Name of the optimized model. - :param target_dir: The directory to save the ONNX model. - :param optimization_level: Optimization level performed by ONNX Runtime of the loaded graph. (default is 1) - """ - # We import these in the function scope so ONNX won't be mandatory for the other handlers: - from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer - from optimum.onnxruntime.configuration import OptimizationConfig - - model_dir = _get_model_dir(model_uri=model_path) - # Creating configuration for optimization step: - optimization_config = OptimizationConfig(optimization_level=optimization_level) - - # Converting our pretrained model to an ONNX-Runtime model: - ort_model = ORTModelForSequenceClassification.from_pretrained( - model_dir, from_transformers=True - ) - - # Creating an ONNX-Runtime optimizer from ONNX model: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - apply_mlrun(optimizer, model_name=model_name) - # Optimizing and saving the ONNX model: - optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/static/item.html b/functions/development/hugging_face_classifier_trainer/latest/static/item.html deleted file mode 100644 index 7db7e49b..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/static/item.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-training
-description: Automatic train and optimize functions for HuggingFace framework
-doc: ''
-example: hugging_face_classifier_trainer.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: davids
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.6.1
-name: hugging_face_classifier_trainer
-platformVersion: 3.5.5
-spec:
-  filename: hugging_face_classifier_trainer.py
-  handler: train
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - onnx~=1.14.1
-  - onnxruntime~=1.16.1
-  - optimum~=1.6.4
-  - transformers~=4.26.1
-  - datasets~=2.10.1
-  - scikit-learn~=1.0.2
-url: ''
-version: 0.2.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/hugging_face_classifier_trainer/latest/static/source.html b/functions/development/hugging_face_classifier_trainer/latest/static/source.html deleted file mode 100644 index 6eee51f5..00000000 --- a/functions/development/hugging_face_classifier_trainer/latest/static/source.html +++ /dev/null @@ -1,854 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import mlrun
-import mlrun.datastore
-import mlrun.utils
-import numpy as np
-import pandas as pd
-import transformers
-from datasets import Dataset, load_dataset, load_metric
-from mlrun import MLClientCtx
-from mlrun import feature_store as fs
-from mlrun.artifacts import Artifact, PlotlyArtifact
-from mlrun.datastore import DataItem
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import create_class
-from plotly import graph_objects as go
-from sklearn.model_selection import train_test_split
-from transformers import (
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-
-# ----------------------from MLRUN--------------------------------
-class HFORTOptimizerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRun's context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to be inserted so the MLRun interface will be fully enabled.
-    _PROPERTIES = {
-        "_auto_log": False,
-        "_context": None,
-        "_model_name": "model",
-        "_tag": "",
-        "_labels": None,
-        "_extra_data": None,
-    }
-    _METHODS = ["enable_auto_logging"]
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "optimize",
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRun's features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-        super(HFORTOptimizerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_optimize(cls):
-        """
-        MLRun's tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self, *args, **kwargs):
-            save_dir = cls._get_function_argument(
-                self.optimize,
-                argument_name="save_dir",
-                passed_args=args,
-                passed_kwargs=kwargs,
-            )[0]
-
-            # Call the original optimize method:
-            result = self.original_optimize(*args, **kwargs)
-
-            if self._auto_log:
-                # Log the onnx model:
-                self._context.log_model(
-                    key="model",
-                    db_key=self._model_name,
-                    model_file=f"{save_dir}/model_optimized.onnx",
-                    tag=self._tag,
-                    framework="ONNX",
-                    labels=self._labels,
-                    extra_data=self._extra_data,
-                )
-
-            return result
-
-        return wrapper
-
-    def enable_auto_logging(
-        self,
-        context: mlrun.MLClientCtx,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        self._auto_log = True
-
-        self._context = context
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data
-
-
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "train",
-        # "evaluate"
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj: Trainer,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        """
-        Enrich the object with this interface properties, methods and functions, so it will have this TensorFlow.Keras
-        MLRuns features.
-        :param obj:                     The object to enrich his interface.
-        :param restoration: Restoration information tuple as returned from 'remove_interface' in order to
-                                        add the interface in a certain state.
-        """
-
-        super(HFTrainerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_train(cls):
-
-        """
-        MLRuns tf.keras.Model.fit wrapper. It will setup the optimizer when using horovod. The optimizer must be
-        passed in a keyword argument and when using horovod, it must be passed as an Optimizer instance, not a string.
-
-        raise MLRunInvalidArgumentError: In case the optimizer provided did not follow the instructions above.
-        """
-
-        def wrapper(self: Trainer, *args, **kwargs):
-            # Restore the evaluation method as `train` will use it:
-            # cls._restore_attribute(obj=self, attribute_name="evaluate")
-
-            # Call the original fit method:
-            result = self.original_train(*args, **kwargs)
-
-            # Replace the evaluation method again:
-            # cls._replace_function(obj=self, function_name="evaluate")
-
-            return result
-
-        return wrapper
-
-
-class MLRunCallback(TrainerCallback):
-    """
-    Callback for collecting logs during training / evaluation of the `Trainer` API.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.MLClientCtx = None,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        super().__init__()
-
-        # Store the configurations:
-        self._context = (
-            context
-            if context is not None
-            else mlrun.get_or_create_ctx("./mlrun-huggingface")
-        )
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data if extra_data is not None else {}
-
-        # Set up the logging mode:
-        self._is_training = False
-        self._steps: List[List[int]] = []
-        self._metric_scores: Dict[str, List[float]] = {}
-        self._artifacts: Dict[str, Artifact] = {}
-
-    def on_epoch_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._steps.append([])
-
-    def on_epoch_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float] = None,
-        **kwargs,
-    ):
-        recent_logs = state.log_history[-1].copy()
-
-        recent_logs.pop("epoch")
-        current_step = int(recent_logs.pop("step"))
-        if current_step not in self._steps[-1]:
-            self._steps[-1].append(current_step)
-
-        for metric_name, metric_score in recent_logs.items():
-            if metric_name.startswith("train_"):
-                if metric_name.split("train_")[1] not in self._metric_scores:
-                    self._metric_scores[metric_name] = [metric_score]
-                continue
-            if metric_name not in self._metric_scores:
-                self._metric_scores[metric_name] = []
-            self._metric_scores[metric_name].append(metric_score)
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._is_training = True
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: PreTrainedModel = None,
-        tokenizer: PreTrainedTokenizer = None,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        temp_directory = tempfile.gettempdir()
-
-        # Save and log the tokenizer:
-        if tokenizer is not None:
-            # Save tokenizer:
-            tokenizer_dir = os.path.join(temp_directory, "tokenizer")
-            tokenizer.save_pretrained(save_directory=tokenizer_dir)
-            # Zip the tokenizer directory:
-            tokenizer_zip = shutil.make_archive(
-                base_name="tokenizer",
-                format="zip",
-                root_dir=tokenizer_dir,
-            )
-            # Log the zip file:
-            self._artifacts["tokenizer"] = self._context.log_artifact(
-                item="tokenizer", local_path=tokenizer_zip
-            )
-
-        # Save the model:
-        model_dir = os.path.join(temp_directory, "model")
-        model.save_pretrained(save_directory=model_dir)
-
-        # Zip the model directory:
-        shutil.make_archive(
-            base_name="model",
-            format="zip",
-            root_dir=model_dir,
-        )
-
-        # Log the model:
-        self._context.log_model(
-            key="model",
-            db_key=self._model_name,
-            model_file="model.zip",
-            tag=self._tag,
-            framework="Hugging Face",
-            labels=self._labels,
-            extra_data={**self._artifacts, **self._extra_data},
-        )
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        self._log_metrics()
-
-        if self._is_training:
-            return
-
-        # TODO: Update the model object
-
-    def _log_metrics(self):
-        for metric_name, metric_scores in self._metric_scores.items():
-            self._context.log_result(key=metric_name, value=metric_scores[-1])
-            if len(metric_scores) > 1:
-                self._log_metric_plot(name=metric_name, scores=metric_scores)
-        self._context.commit(completed=False)
-
-    def _log_metric_plot(self, name: str, scores: List[float]):
-        # Initialize a plotly figure:
-        metric_figure = go.Figure()
-
-        # Add titles:
-        metric_figure.update_layout(
-            title=name.capitalize().replace("_", " "),
-            xaxis_title="Samples",
-            yaxis_title="Scores",
-        )
-
-        # Draw:
-        metric_figure.add_trace(
-            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
-        )
-
-        # Create the plotly artifact:
-        artifact_name = f"{name}_plot"
-        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
-        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
-
-
-def _apply_mlrun_on_trainer(
-    trainer: transformers.Trainer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
-
-    HFTrainerMLRunInterface.add_interface(obj=trainer)
-
-    if auto_log:
-        trainer.add_callback(
-            MLRunCallback(
-                context=context,
-                model_name=model_name,
-                tag=tag,
-                labels=labels,
-                extra_data=extra_data,
-            )
-        )
-
-
-def _apply_mlrun_on_optimizer(
-    optimizer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(
-            HFORTOptimizerMLRunInterface.DEFAULT_CONTEXT_NAME
-        )
-
-    HFORTOptimizerMLRunInterface.add_interface(obj=optimizer)
-
-    if auto_log:
-        optimizer.enable_auto_logging(
-            context=context,
-            model_name=model_name,
-            tag=tag,
-            labels=labels,
-            extra_data=extra_data,
-        )
-
-
-def apply_mlrun(
-    huggingface_object,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    """
-    Wrap the given model with MLRun's interface providing it with mlrun's additional features.
-    :param huggingface_object: The model to wrap. Can be loaded from the model path given as well.
-    :param model_name:         The model name to use for storing the model artifact. Default: "model".
-    :param tag:                The model's tag to log with.
-    :param context:            MLRun context to work with. If no context is given it will be retrieved via
-                               'mlrun.get_or_create_ctx(None)'
-    :param auto_log:           Whether to enable MLRun's auto logging. Default: True.
-    """
-
-    if isinstance(huggingface_object, transformers.Trainer):
-        return _apply_mlrun_on_trainer(
-            trainer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    import optimum.onnxruntime as optimum_ort
-
-    if isinstance(huggingface_object, optimum_ort.ORTOptimizer):
-        return _apply_mlrun_on_optimizer(
-            optimizer=huggingface_object,
-            model_name=model_name,
-            tag=tag,
-            context=context,
-            auto_log=auto_log,
-            labels=labels,
-            extra_data=extra_data,
-        )
-    raise mlrun.errors.MLRunInvalidArgumentError
-
-
-# ---------------------- from auto_trainer--------------------------------
-class KWArgsPrefixes:
-    MODEL_CLASS = "CLASS_"
-    FIT = "FIT_"
-    TRAIN = "TRAIN_"
-    PREDICT = "PREDICT_"
-
-
-def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
-    """
-    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
-    keys.
-
-    :param src:         The source dict to extract the values from.
-    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
-                        prefix.
-    """
-    return {
-        key.replace(prefix_key, ""): val
-        for key, val in src.items()
-        if key.startswith(prefix_key)
-    }
-
-
-def _get_dataframe(
-    context: MLClientCtx,
-    dataset: DataItem,
-    label_columns: Optional[Union[str, List[str]]] = None,
-    drop_columns: Union[str, List[str], int, List[int]] = None,
-) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
-    """
-    Getting the DataFrame of the dataset and drop the columns accordingly.
-
-    :param context:         MLRun context.
-    :param dataset:         The dataset to train the model on.
-                            Can be either a list of lists, dict, URI or a FeatureVector.
-    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
-                            Classification tasks.
-    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
-    """
-    if isinstance(dataset, (list, dict)):
-        dataset = pd.DataFrame(dataset)
-        # Checking if drop_columns provided by integer type:
-        if drop_columns:
-            if isinstance(drop_columns, str) or (
-                isinstance(drop_columns, list)
-                and any(isinstance(col, str) for col in drop_columns)
-            ):
-                context.logger.error(
-                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
-                )
-                raise ValueError
-            dataset.drop(drop_columns, axis=1, inplace=True)
-
-        return dataset, label_columns
-
-    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
-    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
-        # feature-vector case:
-        label_columns = label_columns or dataset.meta.status.label_column
-        dataset = fs.get_offline_features(
-            dataset.meta.uri, drop_columns=drop_columns
-        ).to_dataframe()
-
-        context.logger.info(f"label columns: {label_columns}")
-    else:
-        # simple URL case:
-        dataset = dataset.as_df()
-        if drop_columns:
-            if all(col in dataset for col in drop_columns):
-                dataset = dataset.drop(drop_columns, axis=1)
-            else:
-                context.logger.info(
-                    "not all of the columns to drop in the dataset, drop columns process skipped"
-                )
-    return dataset, label_columns
-
-
-# ---------------------- Hugging Face Trainer --------------------------------
-
-
-def _create_compute_metrics(metrics: List[str]) -> Callable[[EvalPrediction], Dict]:
-    """
-    This function create and returns a function that will be used to compute metrics at evaluation.
-    :param metrics: List of different metrics for evaluate the model such as f1, accuracy etc.
-
-    :returns: Function that will be used to compute metrics at evaluation.
-             Must take a [`EvalPrediction`] and return a dictionary string to metric values.
-    """
-
-    def _compute_metrics(eval_pred):
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        metric_dict_results = {}
-        for metric in metrics:
-            load_met = load_metric(metric)
-            metric_res = load_met.compute(predictions=predictions, references=labels)[
-                metric
-            ]
-            metric_dict_results[metric] = metric_res
-
-        return metric_dict_results
-
-    return _compute_metrics
-
-
-def _edit_columns(
-    dataset: Dataset,
-    drop_columns: List[str] = None,
-    rename_columns: [str, str] = None,
-) -> Dataset:
-    """
-    Drop and renames that columns of the given dataset
-    :param dataset:         Dataset to process
-    :param drop_columns:    The columns to drop from the dataset.
-    :param rename_columns:  Dict of columns ro rename : {: , ...}
-
-    :returns: The dataset after the desired process
-    """
-    if drop_columns:
-        dataset = dataset.remove_columns(drop_columns)
-    if rename_columns:
-        dataset = dataset.rename_columns(rename_columns)
-    return dataset
-
-
-def _prepare_dataset(
-    context: MLClientCtx,
-    dataset_name: str,
-    label_name: str = None,
-    drop_columns: Optional[List[str]] = None,
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    random_state: int = None,
-) -> Tuple[Dataset, Dataset]:
-    """
-    Loading the dataset and editing the columns
-
-    :param context:                 MLRun contex
-    :param dataset_name:            The name of the dataset to get from the HuggingFace hub
-    :param label_name:              The target label of the column in the dataset.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param random_state:            Random state for train_test_split
-
-    """
-
-    context.logger.info(
-        f"Loading and editing {dataset_name} dataset from Hugging Face hub"
-    )
-    rename_cols = {label_name: "labels"}
-
-    # Loading and editing dataset:
-    dataset = load_dataset(dataset_name)
-
-    # train set
-    train_dataset = dataset["train"]
-    if num_of_train_samples:
-        train_dataset = train_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_train_samples))
-        )
-    train_dataset = _edit_columns(train_dataset, drop_columns, rename_cols)
-
-    # test set
-    test_dataset = dataset["test"]
-    if train_test_split_size or num_of_train_samples:
-        train_test_split_size = train_test_split_size or 0.2
-        num_of_test_samples = int(
-            (train_dataset.num_rows * train_test_split_size)
-            // (1 - train_test_split_size)
-        )
-        test_dataset = test_dataset.shuffle(seed=random_state).select(
-            list(range(num_of_test_samples))
-        )
-    test_dataset = _edit_columns(test_dataset, drop_columns, rename_cols)
-
-    return train_dataset, test_dataset
-
-
-def train(
-    context: MLClientCtx,
-    hf_dataset: str = None,
-    dataset: DataItem = None,
-    test_set: DataItem = None,
-    drop_columns: Optional[List[str]] = None,
-    pretrained_tokenizer: str = None,
-    pretrained_model: str = None,
-    model_class: str = None,
-    model_name: str = "huggingface-model",
-    label_name: str = "labels",
-    text_col: str = "text",
-    num_of_train_samples: int = None,
-    train_test_split_size: float = None,
-    metrics: List[str] = None,
-    random_state: int = None,
-):
-    """
-    Training and evaluating a pretrained model with a pretrained tokenizer over a dataset.
-    The dataset can be either be the name of the dataset that contains in the HuggingFace hub,
-    or a URI or a FeatureVector
-
-    :param context:                 MLRun context
-    :param hf_dataset:              The name of the dataset to get from the HuggingFace hub
-    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
-    :param test_set:                The test set to train the model with.
-    :param drop_columns:            The columns to drop from the dataset.
-    :param pretrained_tokenizer:    The name of the pretrained tokenizer from the HuggingFace hub.
-    :param pretrained_model:        The name of the pretrained model from the HuggingFace hub.
-    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
-    :param model_class:             The class of the model, e.g. `transformers.AutoModelForSequenceClassification`
-    :param label_name:              The target label of the column in the dataset.
-    :param text_col:                The input text column un the dataset.
-    :param num_of_train_samples:    Max number of training samples, for debugging.
-    :param train_test_split_size:   Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
-                                    in the test split.
-    :param metrics:                 List of different metrics for evaluate the model such as f1, accuracy etc.
-    :param random_state:            Random state for train_test_split
-    """
-
-    if train_test_split_size is None and test_set is None:
-        context.logger.info(
-            "'train_test_split_size' is not provided, setting train_test_split_size to 0.2"
-        )
-        train_test_split_size = 0.2
-
-    # Creating tokenizer:
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer)
-
-    def preprocess_function(examples):
-        return tokenizer(examples[text_col], truncation=True)
-
-    # prepare data for training
-    if hf_dataset:
-        train_dataset, test_dataset = _prepare_dataset(
-            context,
-            hf_dataset,
-            label_name,
-            drop_columns,
-            num_of_train_samples,
-            train_test_split_size,
-            random_state=random_state,
-        )
-    elif dataset:
-        # Get DataFrame by URL or by FeatureVector:
-        train_dataset, label_name = _get_dataframe(
-            context=context,
-            dataset=dataset,
-            label_columns=label_name,
-            drop_columns=drop_columns,
-        )
-        if test_set:
-            test_dataset, _ = _get_dataframe(
-                context=context,
-                dataset=test_set,
-                label_columns=label_name,
-                drop_columns=drop_columns,
-            )
-        else:
-            train_dataset, test_dataset = train_test_split(
-                train_dataset,
-                test_size=train_test_split_size,
-                random_state=random_state,
-            )
-        train_dataset = Dataset.from_pandas(train_dataset)
-        test_dataset = Dataset.from_pandas(test_dataset)
-    else:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            "Training data was not provided. A training dataset is mandatory for training."
-            " Please provide a training set using one of the arguments 'hf_dataset' or 'dataset'."
-        )
-
-    # Mapping datasets with the tokenizer:
-    tokenized_train = train_dataset.map(preprocess_function, batched=True)
-    tokenized_test = test_dataset.map(preprocess_function, batched=True)
-
-    # Creating data collator for batching:
-    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-    # Parsing kwargs:
-    train_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.TRAIN
-    )
-    model_class_kwargs = _get_sub_dict_by_prefix(
-        src=context.parameters, prefix_key=KWArgsPrefixes.MODEL_CLASS
-    )
-
-    # Loading our pretrained model:
-    model_class_kwargs["pretrained_model_name_or_path"] = (
-        model_class_kwargs.get("pretrained_model_name_or_path") or pretrained_model
-    )
-    train_kwargs["hub_token"] = train_kwargs.get("hub_token") or pretrained_tokenizer
-    if not model_class_kwargs["pretrained_model_name_or_path"]:
-        raise mlrun.errors.MLRunRuntimeError(
-            "Must provide pretrained_model name as "
-            "function argument or in extra params"
-        )
-    model = create_class(model_class).from_pretrained(**model_class_kwargs)
-
-    # Preparing training arguments:
-    training_args = TrainingArguments(
-        **train_kwargs,
-    )
-
-    compute_metrics = _create_compute_metrics(metrics) if metrics else None
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_test,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics,
-    )
-
-    apply_mlrun(trainer, model_name=model_name)
-
-    # Apply training with evaluation:
-    context.logger.info(f"training '{model_name}'")
-    trainer.train()
-
-
-def _get_model_dir(model_uri: str):
-    model_file, _, _ = mlrun.artifacts.get_model(model_uri)
-    model_dir = tempfile.gettempdir()
-    # Unzip the Model:
-    with zipfile.ZipFile(model_file, "r") as zip_file:
-        zip_file.extractall(model_dir)
-
-    return model_dir
-
-
-def optimize(
-    model_path: str,
-    model_name: str = "optimized_model",
-    target_dir: str = "./optimized",
-    optimization_level: int = 1,
-):
-    """
-    Optimizing the transformer model using ONNX optimization.
-
-
-    :param model_path:          The path of the model to optimize.
-    :param model_name:          Name of the optimized model.
-    :param target_dir:          The directory to save the ONNX model.
-    :param optimization_level:  Optimization level performed by ONNX Runtime of the loaded graph. (default is 1)
-    """
-    # We import these in the function scope so ONNX won't be mandatory for the other handlers:
-    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
-    from optimum.onnxruntime.configuration import OptimizationConfig
-
-    model_dir = _get_model_dir(model_uri=model_path)
-    # Creating configuration for optimization step:
-    optimization_config = OptimizationConfig(optimization_level=optimization_level)
-
-    # Converting our pretrained model to an ONNX-Runtime model:
-    ort_model = ORTModelForSequenceClassification.from_pretrained(
-        model_dir, from_transformers=True
-    )
-
-    # Creating an ONNX-Runtime optimizer from ONNX model:
-    optimizer = ORTOptimizer.from_pretrained(ort_model)
-
-    apply_mlrun(optimizer, model_name=model_name)
-    # Optimizing and saving the ONNX model:
-    optimizer.optimize(save_dir=target_dir, optimization_config=optimization_config)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/1.0.0/src/function.yaml b/functions/development/huggingface_auto_trainer/1.0.0/src/function.yaml deleted file mode 100644 index eff09b4c..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/src/function.yaml +++ /dev/null @@ -1,349 +0,0 @@ -kind: job -metadata: - name: huggingface-auto-trainer - tag: '' - hash: 4459f0b675c36a20c8f542126a96b98b0ac82271 - project: '' - labels: - author: Zeevr - categories: - - machine-learning - - model-training -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode:  - commands: [] - code_origin: https://github.com/ZeevRispler/functions.git#a63a647cf6bc3015a8dcbd18903f9db44bdf0b66:/Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py - origin_filename: /Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py - requirements: [] - entry_points: - add_interface: - name: add_interface - doc: '' - parameters: - - name: cls - default: '' - - name: obj - type: Trainer - default: '' - - name: restoration - type: MLRunInterfaceRestorationType - default: null - outputs: - - default: '' - lineno: 70 - mlrun_train: - name: mlrun_train - doc: '' - parameters: - - name: cls - default: '' - outputs: - - default: '' - lineno: 80 - wrapper: - name: wrapper - doc: '' - parameters: - - name: self - type: Trainer - default: '' - outputs: - - default: '' - lineno: 81 - on_epoch_begin: - name: on_epoch_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 129 - on_epoch_end: - name: on_epoch_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: 
TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 140 - on_log: - name: on_log - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: logs - type: Dict[str, float] - default: null - outputs: - - default: '' - lineno: 151 - on_train_begin: - name: on_train_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 177 - on_train_end: - name: on_train_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: model - type: PreTrainedModel - default: null - - name: tokenizer - type: PreTrainedTokenizer - default: null - outputs: - - default: '' - lineno: 188 - on_evaluate: - name: on_evaluate - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 201 - log_metrics: - name: log_metrics - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 215 - log_metric_plot: - name: log_metric_plot - doc: '' - parameters: - - name: self - default: '' - - name: name - type: str - default: '' - - name: scores - type: List[float] - default: '' - outputs: - - default: '' - lineno: 222 - apply_mlrun: - name: apply_mlrun - doc: This is temporary and will be built in mlrun 1.5.0 - parameters: - - name: trainer - type: Trainer - default: '' - - name: model_name - type: 
str - default: null - - name: tag - type: str - default: '' - - name: context - type: MLClientCtx - default: null - - name: auto_log - type: bool - default: true - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: - - default: '' - lineno: 244 - finetune_llm: - name: finetune_llm - doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ - \ dataset.\n The function takes various configuration parameters to customize\ - \ the training process\n and adapt the model to specific tasks using a provided\ - \ dataset." - parameters: - - name: context - type: MLClientCtx - doc: mlrun context in order to log trained model - default: '' - - name: train_dataset - type: Union[str, mlrun.datastore.DataItem] - doc: The train dataset used for fine-tuning the language model. - default: '' - - name: eval_dataset - type: str - doc: The eval dataset used for evaluate the language model during training. - default: null - - name: train_load_dataset_kwargs - type: dict - doc: kwargs for dataset loading - default: {} - - name: eval_load_dataset_kwargs - type: dict - doc: kwargs for dataset loading - default: {} - - name: dataset_columns_to_train - type: Union[str, list] - doc: which columns to pass to the model as inputs - default: text - - name: model - type: Union[str, List[str]] - doc: a tuple containing model name and class, or str with model name or path - default: huggingface-model - - name: tokenizer - type: Union[str, List[str]] - doc: a tuple containing tokenizer name and class, or str with tokenizer name - or path - default: null - - name: deepspeed_config - type: Union[dict, bool] - doc: Configuration options for DeepSpeed (optional). - default: false - - name: quantization_config - type: Union[dict, bool] - doc: Configuration options for model quantization (optional). 
- default: false - - name: lora_config - type: Union[dict, bool] - doc: Configuration options for Low-Rank Approximation (LoRA) (optional). - default: false - - name: training_config - type: dict - doc: Configuration options specific to the fine-tuning training process (optional). - default: {} - - name: model_pretrained_config - type: dict - doc: config to load the pretrained model - default: {} - - name: tokenizer_pretrained_config - type: dict - doc: config to load the pretrained tokenizer - default: {} - - name: data_collator_config - type: dict - doc: Configuration options for data collation during training (optional). - default: {} - - name: task - type: str - doc: A description of the specific task the model is being fine-tuned for. - default: text-generation - - name: use_cuda - type: bool - doc: use gpu or not - default: true - - name: framework - type: str - doc: pt ot tf - default: pt - - name: device_map - type: str - default: auto - outputs: - - default: '' - lineno: 630 - evaluate: - name: evaluate - doc: 'Evaluating the model using perplexity, for more information visit: - - https://huggingface.co/docs/transformers/perplexity' - parameters: - - name: context - doc: mlrun context - default: '' - - name: model_path - doc: path to the model directory - default: '' - - name: data - type: DataFrame - doc: the data to evaluate the model - default: '' - - name: model_name - type: str - doc: name of base model - default: null - - name: tokenizer_name - type: str - doc: name of base tokenizer - default: null - outputs: - - default: '' - lineno: 784 - description: fine-tune llm model with ease - default_handler: finetune_llm - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/huggingface_auto_trainer/1.0.0/src/huggingface_auto_trainer.ipynb 
b/functions/development/huggingface_auto_trainer/1.0.0/src/huggingface_auto_trainer.ipynb deleted file mode 100644 index 847fa98d..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/src/huggingface_auto_trainer.ipynb +++ /dev/null @@ -1,195 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a2c5dc6d-33d0-4e74-a875-6eab556e3b2d", - "metadata": {}, - "source": [ - "# Llm auto trainer" - ] - }, - { - "cell_type": "markdown", - "id": "cc7aa261-17b2-4362-bf6a-34af79b0230b", - "metadata": {}, - "source": [ - "## Notebook Introduction: Fine-Tuning a Large Language Model with Ease\n", - "\n", - "Welcome to this example notebook that demonstrates a simplified yet powerful approach to fine-tuning a Large Language Model (LLM) effortlessly. Fine-tuning is a crucial technique that allows you to adapt pre-trained language models to specific tasks, making them more contextually relevant and useful.\n", - "\n", - "In this notebook, we will walk you through a step-by-step process of fine-tuning a state-of-the-art language model using a user-friendly and efficient method. You don't need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness." 
- ] - }, - { - "cell_type": "markdown", - "id": "425249e9-f43f-45e6-aa25-9f53099049cd", - "metadata": {}, - "source": [ - "### First, we will select the model we wish to fine-tune and take the matching tokenizer and appropriate config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3410e9c2-0557-4961-995e-0ef0cc07bf82", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig\n", - "from transformers import logging\n", - "\n", - "logging.set_verbosity(\"CRITICAL\")\n", - "\n", - "model_name = \"tiiuae/falcon-7b\"\n", - "tokenizer = model_name\n", - "generation_config = GenerationConfig.from_pretrained(model_name)" - ] - }, - { - "cell_type": "markdown", - "id": "f33f3c35-cf61-4b0f-8da9-1c30d3b53230", - "metadata": {}, - "source": [ - "### Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8ee7c35-adf7-4ed8-9e7e-e659b9461cd5", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "\n", - "project = mlrun.get_or_create_project(\n", - " name=\"auto-trainer-test\",\n", - " context=\"./\",\n", - " user_project=True,\n", - " parameters={\n", - " \"default_image\": \"yonishelach/mlrun-llm\",\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d56b834f-adf6-4736-8de7-3348e050f561", - "metadata": {}, - "outputs": [], - "source": [ - "project.set_function(\n", - " \"auto-trainer.py\",\n", - " name=\"auto-trainer\",\n", - " kind=\"job\",\n", - " image=\"yonishelach/mlrun-llm\",\n", - " handler=\"finetune_llm\",\n", - ")\n", - "project.save()" - ] - }, - { - "cell_type": "markdown", - "id": "f42315db-6ddd-4dc1-89f3-c732f92d0d47", - "metadata": {}, - "source": [ - "### we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the 
function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e62e577-15fb-477d-9c56-fa9fb4c2669b", - "metadata": {}, - "outputs": [], - "source": [ - "import transformers\n", - "\n", - "training_arguments = {\n", - " \"per_device_train_batch_size\": 4,\n", - " \"gradient_accumulation_steps\": 1,\n", - " \"warmup_steps\": 2,\n", - " \"max_steps\": 10,\n", - " \"learning_rate\": 2e-4,\n", - " \"fp16\": True,\n", - " \"logging_steps\": 1,\n", - " \"optim\": \"paged_adamw_8bit\",\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "284a5772-f88d-46c9-87bc-fc14e434c1b4", - "metadata": {}, - "source": [ - "### Now we simply run the function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11ab5888-5870-4bf8-9657-db930adecd77", - "metadata": {}, - "outputs": [], - "source": [ - "training_run = mlrun.run_function(\n", - " function=\"auto-trainer\",\n", - " name=\"auto-trainer\",\n", - " local=True,\n", - " params={\n", - " \"model\": (model_name, \"transformers.AutoModelForCausalLM\"),\n", - " \"tokenizer\": tokenizer,\n", - " \"train_dataset\": \"Abirate/english_quotes\",\n", - " \"training_config\": training_arguments,\n", - " \"quantization_config\": True,\n", - " \"lora_config\": True,\n", - " \"dataset_columns_to_train\": \"quote\",\n", - " \"lora_target_modules\": [\"query_key_value\"],\n", - " \"model_pretrained_config\": {\"trust_remote_code\": True, \"use_cache\": False},\n", - " },\n", - " handler=\"finetune_llm\",\n", - " outputs=[\"model\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e674d25-5f1f-4ea8-af02-7d22c2fb6760", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a4dfe9b-407a-43c0-9c5e-56de106477ac", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mlrun-base", - "language": "python", - "name": "conda-env-mlrun-base-py" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/functions/development/huggingface_auto_trainer/1.0.0/src/huggingface_auto_trainer.py b/functions/development/huggingface_auto_trainer/1.0.0/src/huggingface_auto_trainer.py deleted file mode 100644 index d1166318..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/src/huggingface_auto_trainer.py +++ /dev/null @@ -1,855 +0,0 @@ -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - 
data_collator = "data_collator" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - 
return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - 
tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - 
logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = 
tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = 
tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in 
dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. 
- :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load 
datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! 
- ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - 
end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. - neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) diff --git a/functions/development/huggingface_auto_trainer/1.0.0/src/item.yaml b/functions/development/huggingface_auto_trainer/1.0.0/src/item.yaml deleted file mode 100644 index e556c11d..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-training -description: fine-tune llm model with ease -doc: '' -example: huggingface_auto_trainer.ipynb -generationDate: 2023-08-21:17-25 -hidden: false -icon: '' -labels: - author: Zeevr -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.0 -name: huggingface-auto-trainer -platformVersion: 3.5.0 -spec: - filename: huggingface_auto_trainer.py - handler: finetune_llm - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.0.0 diff --git a/functions/development/huggingface_auto_trainer/1.0.0/src/requirements.txt b/functions/development/huggingface_auto_trainer/1.0.0/src/requirements.txt deleted file mode 100644 index 1376b1d0..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/src/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -peft -transformers -torch -datasets -plotly diff --git 
a/functions/development/huggingface_auto_trainer/1.0.0/src/test_huggingface_auto_trainer.py b/functions/development/huggingface_auto_trainer/1.0.0/src/test_huggingface_auto_trainer.py deleted file mode 100644 index 53576e4e..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/src/test_huggingface_auto_trainer.py +++ /dev/null @@ -1,42 +0,0 @@ -import tempfile - -import mlrun - - -def test_train(): - - model_name = "distilgpt2" - tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "tokenizer": tokenizer, - "train_dataset": "Abirate/english_quotes", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - auto_trainer.run( - local=True, - params=params, - handler="finetune_llm", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") diff --git a/functions/development/huggingface_auto_trainer/1.0.0/static/documentation.html b/functions/development/huggingface_auto_trainer/1.0.0/static/documentation.html deleted file mode 100644 index be893164..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/static/documentation.html +++ /dev/null @@ -1,380 +0,0 @@ - - - - - - - -huggingface_auto_trainer package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

huggingface_auto_trainer package

- -
- -
-
-
-
-
-

huggingface_auto_trainer package#

-
-

Submodules#

-
-
-

huggingface_auto_trainer.huggingface_auto_trainer module#

-
-
-class huggingface_auto_trainer.huggingface_auto_trainer.ConfigKeys[source]#
-

Bases: object

-
-
-data_collator = 'data_collator'#
-
-
-
-deepspeed = 'deepspeed'#
-
-
-
-lora = 'lora'#
-
-
-
-model_pretrained = 'model_pretrained'#
-
-
-
-quantization = 'quantization'#
-
-
-
-tokenizer_pretrained = 'tokenizer_pretrained'#
-
-
-
-training = 'training'#
-
-
-
-
-class huggingface_auto_trainer.huggingface_auto_trainer.HFTrainerMLRunInterface[source]#
-

Bases: abc.ABC, Generic[mlrun.frameworks._common.utils.MLRunInterfaceableType]

-

This is temporary and will be built in mlrun 1.5.0 -Interface for adding MLRun features for tensorflow keras API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj: transformers.Trainer, restoration: Optional[Tuple[Dict[str, Any], Dict[str, Any], List[str]]] = None)[source]#
-

Enrich the object with this interface properties, methods and functions so it will have this framework MLRun’s -features.

-
-
Parameters
-
    -
  • obj – The object to enrich his interface.

  • -
  • restoration – Restoration information tuple as returned from ‘remove_interface’ in order to add the -interface in a certain state.

  • -
-
-
-
-
-
-classmethod mlrun_train()[source]#
-
-
-
-
-class huggingface_auto_trainer.huggingface_auto_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
-

Bases: transformers.

-

This is temporary and will be built in mlrun 1.5.0 -Callback for collecting logs during training / evaluation of the Trainer API.

-
-
-log_metric_plot(name: str, scores: List[float])[source]#
-
-
-
-log_metrics()[source]#
-
-
-
-on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
-
-
-
-on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
-
-
-
-
-huggingface_auto_trainer.huggingface_auto_trainer.apply_mlrun(trainer: transformers.Trainer, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
-

This is temporary and will be built in mlrun 1.5.0

-
-
-
-huggingface_auto_trainer.huggingface_auto_trainer.evaluate(context, model_path, data: pandas.core.frame.DataFrame, model_name: Optional[str] = None, tokenizer_name: Optional[str] = None)[source]#
-

Evaluating the model using perplexity, for more information visit: -https://huggingface.co/docs/transformers/perplexity

-
-
Parameters
-
    -
  • context – mlrun context

  • -
  • model_path – path to the model directory

  • -
  • data – the data to evaluate the model

  • -
  • model_name – name of base model

  • -
  • tokenizer_name – name of base tokenizer

  • -
-
-
-
-
-
-huggingface_auto_trainer.huggingface_auto_trainer.finetune_llm(context: mlrun.execution.MLClientCtx, train_dataset: Union[str, mlrun.datastore.base.DataItem], eval_dataset: Optional[str] = None, train_load_dataset_kwargs: dict = {}, eval_load_dataset_kwargs: dict = {}, dataset_columns_to_train: Union[str, list] = 'text', model: Union[str, List[str]] = 'huggingface-model', tokenizer: Optional[Union[str, List[str]]] = None, deepspeed_config: Union[dict, bool] = False, quantization_config: Union[dict, bool] = False, lora_config: Union[dict, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, tokenizer_pretrained_config: dict = {}, data_collator_config: dict = {}, task: str = 'text-generation', use_cuda: bool = True, framework: str = 'pt', device_map: str = 'auto', **kwargs)[source]#
-
-
Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.

The function takes various configuration parameters to customize the training process -and adapt the model to specific tasks using a provided dataset.

-
-
-
-
Parameters
-
    -
  • context – mlrun context in order to log trained model

  • -
  • dataset_columns_to_train – which columns to pass to the model as inputs

  • -
  • eval_load_dataset_kwargs – kwargs for dataset loading

  • -
  • train_load_dataset_kwargs – kwargs for dataset loading

  • -
  • framework – pt ot tf

  • -
  • use_cuda – use gpu or not

  • -
  • tokenizer_pretrained_config – config to load the pretrained tokenizer

  • -
  • model_pretrained_config – config to load the pretrained model

  • -
  • tokenizer – a tuple containing tokenizer name and class, or str with tokenizer name or path

  • -
  • model – a tuple containing model name and class, or str with model name or path

  • -
  • train_dataset – The train dataset used for fine-tuning the language model.

  • -
  • eval_dataset – The eval dataset used for evaluate the language model during training.

  • -
  • deepspeed_config – Configuration options for DeepSpeed (optional).

  • -
  • quantization_config – Configuration options for model quantization (optional).

  • -
  • lora_config – Configuration options for Low-Rank Approximation (LoRA) (optional).

  • -
  • training_config – Configuration options specific to the fine-tuning training process (optional).

  • -
  • data_collator_config – Configuration options for data collation during training (optional).

  • -
  • task – A description of the specific task the model is being fine-tuned for.

  • -
  • kwargs – Additional keyword arguments.

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/1.0.0/static/example.html b/functions/development/huggingface_auto_trainer/1.0.0/static/example.html deleted file mode 100644 index 7ae9a6c4..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/static/example.html +++ /dev/null @@ -1,351 +0,0 @@ - - - - - - - -Llm auto trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Llm auto trainer#

-
-

Notebook Introduction: Fine-Tuning a Large Language Model with Ease#

-

Welcome to this example notebook that demonstrates a simplified yet powerful approach to fine-tuning a Large Language Model (LLM) effortlessly. Fine-tuning is a crucial technique that allows you to adapt pre-trained language models to specific tasks, making them more contextually relevant and useful.

-

In this notebook, we will walk you through a step-by-step process of fine-tuning a state-of-the-art language model using a user-friendly and efficient method. You don’t need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness.

-
-

First, we will select the model we wish to fine-tune and take the matching tokenizer and appropriate config#

-
-
-
import os
-from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
-from transformers import logging
-
-logging.set_verbosity("CRITICAL")
-
-model_name = "tiiuae/falcon-7b"
-tokenizer = model_name
-generation_config = GenerationConfig.from_pretrained(model_name)
-
-
-
-
-
-
-

Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function#

-
-
-
import mlrun
-
-project = mlrun.get_or_create_project(
-    name="auto-trainer-test",
-    context="./",
-    user_project=True,
-    parameters={
-        "default_image": "yonishelach/mlrun-llm",
-    },
-)
-
-
-
-
-
-
-
project.set_function(
-    "auto-trainer.py",
-    name="auto-trainer",
-    kind="job",
-    image="yonishelach/mlrun-llm",
-    handler="finetune_llm",
-)
-project.save()
-
-
-
-
-
-
-

we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the function#

-
-
-
import transformers
-
-training_arguments = {
-    "per_device_train_batch_size": 4,
-    "gradient_accumulation_steps": 1,
-    "warmup_steps": 2,
-    "max_steps": 10,
-    "learning_rate": 2e-4,
-    "fp16": True,
-    "logging_steps": 1,
-    "optim": "paged_adamw_8bit",
-}
-
-
-
-
-
-
-

Now we simply run the function#

-
-
-
training_run = mlrun.run_function(
-    function="auto-trainer",
-    name="auto-trainer",
-    local=True,
-    params={
-        "model": (model_name, "transformers.AutoModelForCausalLM"),
-        "tokenizer": tokenizer,
-        "train_dataset": "Abirate/english_quotes",
-        "training_config": training_arguments,
-        "quantization_config": True,
-        "lora_config": True,
-        "dataset_columns_to_train": "quote",
-        "lora_target_modules": ["query_key_value"],
-        "model_pretrained_config": {"trust_remote_code": True, "use_cache": False},
-    },
-    handler="finetune_llm",
-    outputs=["model"],
-)
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/1.0.0/static/function.html b/functions/development/huggingface_auto_trainer/1.0.0/static/function.html deleted file mode 100644 index 9a1f2953..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/static/function.html +++ /dev/null @@ -1,371 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: huggingface-auto-trainer
-  tag: ''
-  hash: 4459f0b675c36a20c8f542126a96b98b0ac82271
-  project: ''
-  labels:
-    author: Zeevr
-  categories:
-  - machine-learning
-  - model-training
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: 
-    commands: []
-    code_origin: https://github.com/ZeevRispler/functions.git#a63a647cf6bc3015a8dcbd18903f9db44bdf0b66:/Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py
-    origin_filename: /Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py
-    requirements: []
-  entry_points:
-    add_interface:
-      name: add_interface
-      doc: ''
-      parameters:
-      - name: cls
-        default: ''
-      - name: obj
-        type: Trainer
-        default: ''
-      - name: restoration
-        type: MLRunInterfaceRestorationType
-        default: null
-      outputs:
-      - default: ''
-      lineno: 70
-    mlrun_train:
-      name: mlrun_train
-      doc: ''
-      parameters:
-      - name: cls
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 80
-    wrapper:
-      name: wrapper
-      doc: ''
-      parameters:
-      - name: self
-        type: Trainer
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 81
-    on_epoch_begin:
-      name: on_epoch_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 129
-    on_epoch_end:
-      name: on_epoch_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 140
-    on_log:
-      name: on_log
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: logs
-        type: Dict[str, float]
-        default: null
-      outputs:
-      - default: ''
-      lineno: 151
-    on_train_begin:
-      name: on_train_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 177
-    on_train_end:
-      name: on_train_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: model
-        type: PreTrainedModel
-        default: null
-      - name: tokenizer
-        type: PreTrainedTokenizer
-        default: null
-      outputs:
-      - default: ''
-      lineno: 188
-    on_evaluate:
-      name: on_evaluate
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 201
-    log_metrics:
-      name: log_metrics
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 215
-    log_metric_plot:
-      name: log_metric_plot
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: name
-        type: str
-        default: ''
-      - name: scores
-        type: List[float]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 222
-    apply_mlrun:
-      name: apply_mlrun
-      doc: This is temporary and will be built in mlrun 1.5.0
-      parameters:
-      - name: trainer
-        type: Trainer
-        default: ''
-      - name: model_name
-        type: str
-        default: null
-      - name: tag
-        type: str
-        default: ''
-      - name: context
-        type: MLClientCtx
-        default: null
-      - name: auto_log
-        type: bool
-        default: true
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs:
-      - default: ''
-      lineno: 244
-    finetune_llm:
-      name: finetune_llm
-      doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\
-        \ dataset.\n The function takes various configuration parameters to customize\
-        \ the training process\n and adapt the model to specific tasks using a provided\
-        \ dataset."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: mlrun context in order to log trained model
-        default: ''
-      - name: train_dataset
-        type: Union[str, mlrun.datastore.DataItem]
-        doc: The train dataset used for fine-tuning the language model.
-        default: ''
-      - name: eval_dataset
-        type: str
-        doc: The eval dataset used for evaluate the language model during training.
-        default: null
-      - name: train_load_dataset_kwargs
-        type: dict
-        doc: kwargs for dataset loading
-        default: {}
-      - name: eval_load_dataset_kwargs
-        type: dict
-        doc: kwargs for dataset loading
-        default: {}
-      - name: dataset_columns_to_train
-        type: Union[str, list]
-        doc: which columns to pass to the model as inputs
-        default: text
-      - name: model
-        type: Union[str, List[str]]
-        doc: a tuple containing model name and class, or str with model name or path
-        default: huggingface-model
-      - name: tokenizer
-        type: Union[str, List[str]]
-        doc: a tuple containing tokenizer name and class, or str with tokenizer name
-          or path
-        default: null
-      - name: deepspeed_config
-        type: Union[dict, bool]
-        doc: Configuration options for DeepSpeed (optional).
-        default: false
-      - name: quantization_config
-        type: Union[dict, bool]
-        doc: Configuration options for model quantization (optional).
-        default: false
-      - name: lora_config
-        type: Union[dict, bool]
-        doc: Configuration options for Low-Rank Approximation (LoRA) (optional).
-        default: false
-      - name: training_config
-        type: dict
-        doc: Configuration options specific to the fine-tuning training process (optional).
-        default: {}
-      - name: model_pretrained_config
-        type: dict
-        doc: config to load the pretrained model
-        default: {}
-      - name: tokenizer_pretrained_config
-        type: dict
-        doc: config to load the pretrained tokenizer
-        default: {}
-      - name: data_collator_config
-        type: dict
-        doc: Configuration options for data collation during training (optional).
-        default: {}
-      - name: task
-        type: str
-        doc: A description of the specific task the model is being fine-tuned for.
-        default: text-generation
-      - name: use_cuda
-        type: bool
-        doc: use gpu or not
-        default: true
-      - name: framework
-        type: str
-        doc: pt ot tf
-        default: pt
-      - name: device_map
-        type: str
-        default: auto
-      outputs:
-      - default: ''
-      lineno: 630
-    evaluate:
-      name: evaluate
-      doc: 'Evaluating the model using perplexity, for more information visit:
-
-        https://huggingface.co/docs/transformers/perplexity'
-      parameters:
-      - name: context
-        doc: mlrun context
-        default: ''
-      - name: model_path
-        doc: path to the model directory
-        default: ''
-      - name: data
-        type: DataFrame
-        doc: the data to evaluate the model
-        default: ''
-      - name: model_name
-        type: str
-        doc: name of base model
-        default: null
-      - name: tokenizer_name
-        type: str
-        doc: name of base tokenizer
-        default: null
-      outputs:
-      - default: ''
-      lineno: 784
-  description: fine-tune llm model with ease
-  default_handler: finetune_llm
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/1.0.0/static/huggingface_auto_trainer.html b/functions/development/huggingface_auto_trainer/1.0.0/static/huggingface_auto_trainer.html deleted file mode 100644 index 2063d183..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/static/huggingface_auto_trainer.html +++ /dev/null @@ -1,995 +0,0 @@ - - - - - - - -huggingface_auto_trainer.huggingface_auto_trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for huggingface_auto_trainer.huggingface_auto_trainer

-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-
[docs]class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - data_collator = "data_collator"
- - -# ----------------------from MLRUN-------------------------------- -
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - -
[docs] @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper
- - -
[docs]class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - -
[docs] def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([])
- -
[docs] def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics()
- -
[docs] def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score)
- -
[docs] def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True
- -
[docs] def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics()
- -
[docs] def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return
- -
[docs] def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False)
- -
[docs] def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact)
- - -
[docs]def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - )
- - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - 
logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = 
tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = 
tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in 
dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -
[docs]def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). 
- :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - 
train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - )
- - -
[docs]def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. 
the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. - neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/1.0.0/static/item.html b/functions/development/huggingface_auto_trainer/1.0.0/static/item.html deleted file mode 100644 index 2ca49a41..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-training
-description: fine-tune llm model with ease
-doc: ''
-example: huggingface_auto_trainer.ipynb
-generationDate: 2023-08-21:17-25
-hidden: false
-icon: ''
-labels:
-  author: Zeevr
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.0
-name: huggingface-auto-trainer
-platformVersion: 3.5.0
-spec:
-  filename: huggingface_auto_trainer.py
-  handler: finetune_llm
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.0.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/1.0.0/static/source.html b/functions/development/huggingface_auto_trainer/1.0.0/static/source.html deleted file mode 100644 index 7c445e5d..00000000 --- a/functions/development/huggingface_auto_trainer/1.0.0/static/source.html +++ /dev/null @@ -1,877 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-class ConfigKeys:
-    deepspeed = "deepspeed"
-    quantization = "quantization"
-    lora = "lora"
-    training = "training"
-    tokenizer_pretrained = "tokenizer_pretrained"
-    model_pretrained = "model_pretrained"
-    data_collator = "data_collator"
-
-
-# ----------------------from MLRUN--------------------------------
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "train",
-        # "evaluate"
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj: Trainer,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        super(HFTrainerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_train(cls):
-        def wrapper(self: Trainer, *args, **kwargs):
-            # Restore the evaluation method as `train` will use it:
-            # cls._restore_attribute(obj=self, attribute_name="evaluate")
-
-            # Call the original fit method:
-            result = self.original_train(*args, **kwargs)
-
-            # Replace the evaluation method again:
-            # cls._replace_function(obj=self, function_name="evaluate")
-
-            return result
-
-        return wrapper
-
-
-class MLRunCallback(TrainerCallback):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Callback for collecting logs during training / evaluation of the `Trainer` API.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.MLClientCtx = None,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        super().__init__()
-
-        # Store the configurations:
-        self._context = (
-            context
-            if context is not None
-            else mlrun.get_or_create_ctx("./mlrun-huggingface")
-        )
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data if extra_data is not None else {}
-
-        # Set up the logging mode:
-        self._is_training = False
-        self._steps: List[List[int]] = []
-        self._metric_scores: Dict[str, List[float]] = {}
-        self._artifacts: Dict[str, Artifact] = {}
-
-    def on_epoch_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self._steps.append([])
-
-    def on_epoch_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self.log_metrics()
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float] = None,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        recent_logs = state.log_history[-1].copy()
-
-        recent_logs.pop("epoch")
-        current_step = int(recent_logs.pop("step"))
-        if current_step not in self._steps[-1]:
-            self._steps[-1].append(current_step)
-
-        for metric_name, metric_score in recent_logs.items():
-            if metric_name.startswith("train_"):
-                if metric_name.split("train_")[1] not in self._metric_scores:
-                    self._metric_scores[metric_name] = [metric_score]
-                continue
-            if metric_name not in self._metric_scores:
-                self._metric_scores[metric_name] = []
-            self._metric_scores[metric_name].append(metric_score)
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self._is_training = True
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: PreTrainedModel = None,
-        tokenizer: PreTrainedTokenizer = None,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self.log_metrics()
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self.log_metrics()
-
-        if self._is_training:
-            return
-
-    def log_metrics(self):
-        for metric_name, metric_scores in self._metric_scores.items():
-            self._context.log_result(key=metric_name, value=metric_scores[-1])
-            if len(metric_scores) > 1:
-                self.log_metric_plot(name=metric_name, scores=metric_scores)
-        self._context.commit(completed=False)
-
-    def log_metric_plot(self, name: str, scores: List[float]):
-        # Initialize a plotly figure:
-        metric_figure = go.Figure()
-
-        # Add titles:
-        metric_figure.update_layout(
-            title=name.capitalize().replace("_", " "),
-            xaxis_title="Samples",
-            yaxis_title="Scores",
-        )
-
-        # Draw:
-        metric_figure.add_trace(
-            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
-        )
-
-        # Create the plotly artifact:
-        artifact_name = f"{name}_plot"
-        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
-        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
-
-
-def apply_mlrun(
-    trainer: transformers.Trainer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    """
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
-
-    HFTrainerMLRunInterface.add_interface(obj=trainer)
-
-    if auto_log:
-        trainer.add_callback(
-            MLRunCallback(
-                context=context,
-                model_name=model_name,
-                tag=tag,
-                labels=labels,
-                extra_data=extra_data,
-            )
-        )
-
-
-# ----------------------end from MLRUN--------------------------------
-
-
-def _print_trainable_parameters(model):
-    """
-    Prints the number of trainable parameters in the model.
-    """
-    trainable_params = 0
-    all_param = 0
-    for _, param in model.named_parameters():
-        all_param += param.numel()
-        if param.requires_grad:
-            trainable_params += param.numel()
-    print(
-        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
-        f" {100 * trainable_params / all_param}"
-    )
-
-
-# default configs
-# will be used if user provides "True" with config name as input
-QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16,
-)
-
-LORA_CONFIG = peft.LoraConfig(
-    r=8,
-    lora_alpha=32,
-    target_modules=["query_key_value"],
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM",
-)
-
-DEEPSPEED_CONFIG = {
-    "train_micro_batch_size_per_gpu": "auto",
-    "fp16": {"enabled": True},
-    "autotuning": {
-        "enabled": True,
-        "arg_mappings": {
-            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
-            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
-        },
-    },
-    "zero_optimization": {
-        "stage": 2,
-    },
-}
-
-
-def _update_config(src: dict, dst: dict):
-    """
-    update configs according to user, this way the user can add/modify values in default configs for e.g.
-
-    goes over all configs and corresponding prefixes, collect all the keys from the given dict that start
-     with the prefix and add them to appropriate config
-
-    :param src: dict of all candidate values to update dict.
-    :param dst: dict containing all configs to update.
-    """
-
-    for config_name, config in dst.items():
-
-        # If given True we use default dict
-        # Can also be False or a config dict given from user, so we check specifically fo True
-        if config is True and config_name == "quantization":
-            config = QUANTIZATION_CONFIG
-
-        if config is True and config_name == "lora":
-            config = LORA_CONFIG
-
-        if config is True and config_name == "deepspeed":
-            config = DEEPSPEED_CONFIG
-
-        # in some cases we can get a boolean value, in that case no need to look for args
-        if isinstance(config, bool):
-            config = None
-
-        elif isinstance(config, dict):
-            for key, val in src.items():
-                if key.startswith(config_name):
-                    config[key.replace(f"{config_name}_", "")] = val
-
-        # update by config name
-        else:
-            for key, val in src.items():
-                if key.startswith(config_name):
-                    setattr(config, key.replace(f"{config_name}_", ""), val)
-
-        dst.update({config_name: config})
-
-
-def _get_class_object(class_path: str) -> type:
-    """
-    given a full class name, this function returns the correct class
-
-    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')
-
-    :return the wanted class object
-    """
-    module_path, class_name = class_path.rsplit(".", 1)
-    module = importlib.import_module(module_path)
-    return getattr(module, class_name)
-
-
-def _set_model_and_tokenizer(
-    model: Union[str, List[str]],
-    tokenizer: Union[str, List[str]],
-    task: str,
-    framework: str,
-    lora_config: dict,
-    quantization_config: dict,
-    use_cuda: bool,
-    tokenizer_pretrained_config,
-    model_pretrained_config,
-    device_map: str,
-):
-    """
-    get the correct model and tokenizer according to given user inputs
-
-    :param model: a tuple containing model name and class, or str with model name or path
-    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
-    :param task: a supported nlp task, used to choose model if not provided
-    :param framework: pt or tf
-    :param lora_config: lora config or None, to load model in appropriate way
-    :param quantization_config: quantization config or None, to load model in appropriate way
-    :param use_cuda: use gpu or not
-    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
-    :param model_pretrained_config: config to load the pretrained model
-    :param device_map: a device map for model training if using number of gpu's
-
-    :returns: model and tokenizer
-    """
-    # if task is not supported and no model was given we can't choose one
-    if task and task not in supported_tasks and not model:
-        logger.error("unsupported task option chosen")
-        raise
-
-    # load model from store
-    if isinstance(model, str) and is_store_uri(model):
-        pass
-        # TODO: load both model and tokenizer and return, need guy's help
-
-    # if it's a tuple them we assume it contains of both name and class
-    if isinstance(model, list):
-        model_name, model_class = model
-        model_class = _get_class_object(model_class)
-
-    # in the case we don't get the model class we need the task in order to choose the correct model
-    else:
-        if task is None:
-            logger.error("task must be chosen in order to determine the correct model")
-            raise Exception(
-                "this function requires either a supported task or a model and model class to be chosen"
-            )
-
-        _, available_classes, task_options = transformers.pipelines.check_task(task)
-
-        if isinstance(model, str):
-            model_name = model
-
-        # if model is not given, we take the default model for the given task
-        else:
-            model_name, _ = transformers.pipelines.get_default_model_and_revision(
-                available_classes, framework, task_options
-            )
-        if not available_classes.get(framework, tuple()):
-            logger.error(
-                "given task's default model is not supported in specified framework"
-            )
-            raise Exception(
-                "this function requires either a supported task or a model and model class to be chosen"
-            )
-
-        model_class = available_classes[framework][0]
-
-    # load the pretrained model
-    if use_cuda:
-        device_map = device_map
-    else:
-        device_map = None
-
-    model = model_class.from_pretrained(
-        model_name,
-        quantization_config=quantization_config,
-        device_map=device_map,
-        **model_pretrained_config,
-    )
-
-    # If quantization config is given we will load a quantized model, if not a regular one
-    if quantization_config:
-        model.gradient_checkpointing_enable()
-        model = peft.prepare_model_for_kbit_training(model)
-
-    # If lora config was given we want to do lora fine tune, we update model here
-    if lora_config:
-        model = peft.get_peft_model(model, lora_config)
-
-    # if not specified we choose the default tokenizer that corresponding to the model
-    if tokenizer is None:
-        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-        return model_name, model, tokenizer
-
-    if isinstance(tokenizer, str):
-        tokenizer_name = tokenizer
-        tokenizer_class = transformers.AutoTokenizer
-
-    # if it's not a str then it's a tuple of both name and class
-    else:
-        tokenizer_name, tokenizer_class = tokenizer
-        tokenizer_class = _get_class_object(tokenizer_class)
-
-    tokenizer = tokenizer_class.from_pretrained(
-        tokenizer_name, **tokenizer_pretrained_config
-    )
-
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return model_name, model, tokenizer
-
-
-def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
-    """
-    loads the specific dataset provided by the user
-
-    :param dataset: name or path of dataset to load
-    :param is_train: bool that indicates the purpose of the dataset
-    :param kwargs: other kwargs for loading the dataset
-
-    :returns: loaded dataset
-    """
-    # if split in kwargs then the user decides how to split the dataset
-    if "split" in kwargs:
-        return load_dataset(dataset, **kwargs)
-
-    # if it's a dataset for train we split with train
-    if is_train:
-        return load_dataset(dataset, split="train", **kwargs)
-
-    # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them
-    dataset = load_dataset(dataset, **kwargs)
-    if "test" in dataset:
-        return dataset.get("test")
-    elif "eval" in dataset:
-        return dataset.get("eval")
-    elif "validation" in dataset:
-        return dataset.get("validation")
-
-
-def _prepare_dataset(
-    train_dataset: str,
-    eval_dataset: str,
-    train_load_dataset_kwargs,
-    eval_load_dataset_kwargs,
-    tokenizer,
-    dataset_columns_to_train: Union[str, list],
-) -> (Dataset, Union[Dataset, None]):
-    """
-    Loads the train and eval datasets (if provided) passes them through the tokenizer and
-    returns them ready to use in training
-
-    :param train_dataset: the name or path to the train dataset
-    :param eval_dataset: the name or path to the eval dataset
-    :param dataset_columns_to_train: which columns to pass to the model as inputs
-                                        (need to pass through the tokenizer first)
-    :param train_load_dataset_kwargs: kwargs for dataset loading
-    :param eval_load_dataset_kwargs: kwargs for dataset loading
-    :param tokenizer: the tokenizer to pass the data through
-
-    :returns: tokenized datasets
-    """
-    if not tokenizer.pad_token:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    # we take col name/s in a list for easy generalization
-    if isinstance(dataset_columns_to_train, str):
-        dataset_columns_to_train = [dataset_columns_to_train]
-
-    if isinstance(train_dataset, mlrun.datastore.DataItem):
-        train_dataset = Dataset.from_pandas(train_dataset.as_df())
-        return (
-            train_dataset.map(
-                lambda examples: tokenizer(
-                    *[examples[col] for col in dataset_columns_to_train],
-                    truncation=True,
-                    padding=True,
-                ),
-                batched=True,
-            ),
-            None,
-        )
-
-    # Load datasets
-    # if provided two paths/names we load each separately using designated func
-    if eval_dataset:
-        train_dataset = _dataset_loader(
-            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
-        )
-        eval_dataset = _dataset_loader(
-            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
-        )
-
-    # if only on path is given then we must check if it contains both dataset or if only one should be used
-    else:
-        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
-        if "train" in dataset:
-            train_dataset = dataset.get("train")
-            if "test" in dataset:
-                eval_dataset = dataset.get("test")
-            elif "eval" in dataset:
-                eval_dataset = dataset.get("eval")
-            elif "validation" in dataset:
-                eval_dataset = dataset.get("validation")
-            else:
-                # only train dataset given, tokenize and return it
-                return (
-                    train_dataset.map(
-                        lambda examples: tokenizer(
-                            *[examples[col] for col in dataset_columns_to_train],
-                            truncation=True,
-                            padding=True,
-                        ),
-                        batched=True,
-                    ),
-                    None,
-                )
-        else:
-            logger.error("train dataset is mandatory")
-            raise KeyError("no train dataset found in given dataset")
-
-    # Tokenize the data so the model can understand it
-    tokenized_train_dataset = train_dataset.map(
-        lambda examples: tokenizer(
-            *[examples[col] for col in dataset_columns_to_train],
-            truncation=True,
-            padding=True,
-        ),
-        batched=True,
-    )
-
-    tokenized_eval_dataset = eval_dataset.map(
-        lambda examples: tokenizer(
-            *[examples[col] for col in dataset_columns_to_train],
-            truncation=True,
-            padding=True,
-        ),
-        batched=True,
-    )
-
-    return tokenized_train_dataset, tokenized_eval_dataset
-
-
-def finetune_llm(
-    context: mlrun.MLClientCtx,
-    train_dataset: Union[str, mlrun.datastore.DataItem],
-    eval_dataset: str = None,
-    train_load_dataset_kwargs: dict = {},
-    eval_load_dataset_kwargs: dict = {},
-    dataset_columns_to_train: Union[str, list] = "text",
-    model: Union[str, List[str]] = "huggingface-model",
-    tokenizer: Union[str, List[str]] = None,
-    deepspeed_config: Union[dict, bool] = False,
-    quantization_config: Union[dict, bool] = False,
-    lora_config: Union[dict, bool] = False,
-    training_config: dict = {},
-    model_pretrained_config: dict = {},
-    tokenizer_pretrained_config: dict = {},
-    data_collator_config: dict = {},
-    task: str = "text-generation",
-    use_cuda: bool = True,
-    framework: str = "pt",
-    device_map: str = "auto",
-    **kwargs,
-):
-    """
-    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
-     The function takes various configuration parameters to customize the training process
-     and adapt the model to specific tasks using a provided dataset.
-
-    :param context: mlrun context in order to log trained model
-    :param dataset_columns_to_train: which columns to pass to the model as inputs
-    :param eval_load_dataset_kwargs: kwargs for dataset loading
-    :param train_load_dataset_kwargs: kwargs for dataset loading
-    :param framework: pt ot tf
-    :param use_cuda: use gpu or not
-    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
-    :param model_pretrained_config: config to load the pretrained model
-    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
-    :param model: a tuple containing model name and class, or str with model name or path
-    :param train_dataset: The train dataset used for fine-tuning the language model.
-    :param eval_dataset: The eval dataset used for evaluate the language model during training.
-    :param deepspeed_config: Configuration options for DeepSpeed (optional).
-    :param quantization_config: Configuration options for model quantization (optional).
-    :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional).
-    :param training_config: Configuration options specific to the fine-tuning training process (optional).
-    :param data_collator_config: Configuration options for data collation during training (optional).
-    :param task: A description of the specific task the model is being fine-tuned for.
-    :param kwargs: Additional keyword arguments.
-    """
-
-    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
-    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design
-
-    # Look for updates to configs given in kwargs
-    configs = {
-        ConfigKeys.deepspeed: deepspeed_config,
-        ConfigKeys.quantization: quantization_config,
-        ConfigKeys.lora: lora_config,
-        ConfigKeys.training: training_config,
-        ConfigKeys.model_pretrained: model_pretrained_config,
-        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
-        ConfigKeys.data_collator: data_collator_config,
-    }
-    _update_config(dst=configs, src=kwargs)
-
-    # check gpu permission and availability
-    if use_cuda:
-        if torch.cuda.is_available():
-            # Clean gpu cache
-            torch.cuda.empty_cache()
-        else:
-            logger.warning("'use_cuda' is set to True, but no cuda device is available")
-
-    # get model and tokenizer
-    model_name, model, tokenizer = _set_model_and_tokenizer(
-        model=model,
-        tokenizer=tokenizer,
-        task=task,
-        framework=framework,
-        lora_config=configs[ConfigKeys.lora],
-        quantization_config=configs[ConfigKeys.quantization],
-        use_cuda=use_cuda,
-        tokenizer_pretrained_config=tokenizer_pretrained_config,
-        model_pretrained_config=configs[ConfigKeys.model_pretrained],
-        device_map=device_map,
-    )
-
-    # Load datasets
-    tokenized_train, tokenized_eval = _prepare_dataset(
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        train_load_dataset_kwargs=train_load_dataset_kwargs,
-        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
-        tokenizer=tokenizer,
-        dataset_columns_to_train=dataset_columns_to_train,
-    )
-
-    # Initialize the data collator for the trainer to use in order to create batches of data
-    data_collator = transformers.DataCollatorForLanguageModeling(
-        tokenizer=tokenizer, mlm=False, **data_collator_config
-    )
-
-    # Initialize training kwargs from user kwargs:
-    train_kwargs = configs[ConfigKeys.training]
-
-    # If deepspeed config given we add it to training kwargs
-    if configs[ConfigKeys.deepspeed]:
-        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]
-
-    # Take a look at the trainable parameters in the model
-    _print_trainable_parameters(model)
-
-    # Preparing training arguments:
-    training_args = transformers.TrainingArguments(
-        output_dir=tempfile.mkdtemp(),
-        **train_kwargs,
-    )
-
-    trainer = transformers.Trainer(
-        model=model,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_eval,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        args=training_args,
-    )
-
-    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
-    model.config.use_cache = (
-        False  # silence the warnings. Please re-enable for inference!
-    )
-
-    # Apply training with evaluation:
-    context.logger.info(f"training '{model_name}'")
-    trainer.train()
-
-    temp_directory = tempfile.TemporaryDirectory().name
-    trainer.save_model(temp_directory)
-
-    # Zip the model directory:
-    shutil.make_archive(
-        base_name="model",
-        format="zip",
-        root_dir=temp_directory,
-    )
-
-    # Log the model:
-    context.log_model(
-        key="model",
-        db_key=model_name.split("/")[-1],
-        model_file="model.zip",
-        tag="",
-        framework="Hugging Face",
-    )
-
-
-def evaluate(
-    context,
-    model_path,
-    data: pd.DataFrame,
-    model_name: str = None,
-    tokenizer_name: str = None,
-):
-    """
-    Evaluating the model using perplexity, for more information visit:
-    https://huggingface.co/docs/transformers/perplexity
-
-    :param context:     mlrun context
-    :param model_path:  path to the model directory
-    :param data:        the data to evaluate the model
-    :param model_name:  name of base model
-    :param tokenizer_name: name of base tokenizer
-    """
-    # Get the model artifact and file:
-    (
-        model_file,
-        model_artifact,
-        extra_data,
-    ) = mlrun.artifacts.get_model(model_path)
-
-    # Read the name:
-    _model_name = model_artifact.spec.db_key
-
-    # Extract logged model files:
-    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
-    with zipfile.ZipFile(model_file, "r") as zip_file:
-        zip_file.extractall(model_directory)
-
-    # Loading the saved pretrained tokenizer and model:
-    dataset = Dataset.from_pandas(data)
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    pad_token_id = tokenizer.eos_token_id
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
-    )
-    model = PeftModel.from_pretrained(model, model_directory)
-    model.eval()
-    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")
-
-    max_length = 1024
-    stride = 512
-    seq_len = encodings.input_ids.size(1)
-
-    nlls = []
-    prev_end_loc = 0
-    for begin_loc in range(0, seq_len, stride):
-        end_loc = min(begin_loc + max_length, seq_len)
-        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
-        input_ids = encodings.input_ids[:, begin_loc:end_loc]
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-
-        with torch.no_grad():
-            outputs = model(input_ids.cuda(), labels=target_ids)
-
-            # loss is calculated using CrossEntropyLoss which averages over valid labels
-            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
-            # to the left by 1.
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean()).item()
-    context.log_result("perplexity", ppl)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/latest/src/function.yaml b/functions/development/huggingface_auto_trainer/latest/src/function.yaml deleted file mode 100644 index eff09b4c..00000000 --- a/functions/development/huggingface_auto_trainer/latest/src/function.yaml +++ /dev/null @@ -1,349 +0,0 @@ -kind: job -metadata: - name: huggingface-auto-trainer - tag: '' - hash: 4459f0b675c36a20c8f542126a96b98b0ac82271 - project: '' - labels: - author: Zeevr - categories: - - machine-learning - - model-training -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode:  - commands: [] - code_origin: https://github.com/ZeevRispler/functions.git#a63a647cf6bc3015a8dcbd18903f9db44bdf0b66:/Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py - origin_filename: /Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py - requirements: [] - entry_points: - add_interface: - name: add_interface - doc: '' - parameters: - - name: cls - default: '' - - name: obj - type: Trainer - default: '' - - name: restoration - type: MLRunInterfaceRestorationType - default: null - outputs: - - default: '' - lineno: 70 - mlrun_train: - name: mlrun_train - doc: '' - parameters: - - name: cls - default: '' - outputs: - - default: '' - lineno: 80 - wrapper: - name: wrapper - doc: '' - parameters: - - name: self - type: Trainer - default: '' - outputs: - - default: '' - lineno: 81 - on_epoch_begin: - name: on_epoch_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 129 - on_epoch_end: - name: on_epoch_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: 
TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 140 - on_log: - name: on_log - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: logs - type: Dict[str, float] - default: null - outputs: - - default: '' - lineno: 151 - on_train_begin: - name: on_train_begin - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 177 - on_train_end: - name: on_train_end - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - - name: model - type: PreTrainedModel - default: null - - name: tokenizer - type: PreTrainedTokenizer - default: null - outputs: - - default: '' - lineno: 188 - on_evaluate: - name: on_evaluate - doc: '' - parameters: - - name: self - default: '' - - name: args - type: TrainingArguments - default: '' - - name: state - type: TrainerState - default: '' - - name: control - type: TrainerControl - default: '' - outputs: - - default: '' - lineno: 201 - log_metrics: - name: log_metrics - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 215 - log_metric_plot: - name: log_metric_plot - doc: '' - parameters: - - name: self - default: '' - - name: name - type: str - default: '' - - name: scores - type: List[float] - default: '' - outputs: - - default: '' - lineno: 222 - apply_mlrun: - name: apply_mlrun - doc: This is temporary and will be built in mlrun 1.5.0 - parameters: - - name: trainer - type: Trainer - default: '' - - name: model_name - type: 
str - default: null - - name: tag - type: str - default: '' - - name: context - type: MLClientCtx - default: null - - name: auto_log - type: bool - default: true - - name: labels - type: Dict[str, str] - default: null - - name: extra_data - type: dict - default: null - outputs: - - default: '' - lineno: 244 - finetune_llm: - name: finetune_llm - doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\ - \ dataset.\n The function takes various configuration parameters to customize\ - \ the training process\n and adapt the model to specific tasks using a provided\ - \ dataset." - parameters: - - name: context - type: MLClientCtx - doc: mlrun context in order to log trained model - default: '' - - name: train_dataset - type: Union[str, mlrun.datastore.DataItem] - doc: The train dataset used for fine-tuning the language model. - default: '' - - name: eval_dataset - type: str - doc: The eval dataset used for evaluate the language model during training. - default: null - - name: train_load_dataset_kwargs - type: dict - doc: kwargs for dataset loading - default: {} - - name: eval_load_dataset_kwargs - type: dict - doc: kwargs for dataset loading - default: {} - - name: dataset_columns_to_train - type: Union[str, list] - doc: which columns to pass to the model as inputs - default: text - - name: model - type: Union[str, List[str]] - doc: a tuple containing model name and class, or str with model name or path - default: huggingface-model - - name: tokenizer - type: Union[str, List[str]] - doc: a tuple containing tokenizer name and class, or str with tokenizer name - or path - default: null - - name: deepspeed_config - type: Union[dict, bool] - doc: Configuration options for DeepSpeed (optional). - default: false - - name: quantization_config - type: Union[dict, bool] - doc: Configuration options for model quantization (optional). 
- default: false - - name: lora_config - type: Union[dict, bool] - doc: Configuration options for Low-Rank Approximation (LoRA) (optional). - default: false - - name: training_config - type: dict - doc: Configuration options specific to the fine-tuning training process (optional). - default: {} - - name: model_pretrained_config - type: dict - doc: config to load the pretrained model - default: {} - - name: tokenizer_pretrained_config - type: dict - doc: config to load the pretrained tokenizer - default: {} - - name: data_collator_config - type: dict - doc: Configuration options for data collation during training (optional). - default: {} - - name: task - type: str - doc: A description of the specific task the model is being fine-tuned for. - default: text-generation - - name: use_cuda - type: bool - doc: use gpu or not - default: true - - name: framework - type: str - doc: pt ot tf - default: pt - - name: device_map - type: str - default: auto - outputs: - - default: '' - lineno: 630 - evaluate: - name: evaluate - doc: 'Evaluating the model using perplexity, for more information visit: - - https://huggingface.co/docs/transformers/perplexity' - parameters: - - name: context - doc: mlrun context - default: '' - - name: model_path - doc: path to the model directory - default: '' - - name: data - type: DataFrame - doc: the data to evaluate the model - default: '' - - name: model_name - type: str - doc: name of base model - default: null - - name: tokenizer_name - type: str - doc: name of base tokenizer - default: null - outputs: - - default: '' - lineno: 784 - description: fine-tune llm model with ease - default_handler: finetune_llm - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/huggingface_auto_trainer/latest/src/huggingface_auto_trainer.ipynb 
b/functions/development/huggingface_auto_trainer/latest/src/huggingface_auto_trainer.ipynb deleted file mode 100644 index 847fa98d..00000000 --- a/functions/development/huggingface_auto_trainer/latest/src/huggingface_auto_trainer.ipynb +++ /dev/null @@ -1,195 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a2c5dc6d-33d0-4e74-a875-6eab556e3b2d", - "metadata": {}, - "source": [ - "# Llm auto trainer" - ] - }, - { - "cell_type": "markdown", - "id": "cc7aa261-17b2-4362-bf6a-34af79b0230b", - "metadata": {}, - "source": [ - "## Notebook Introduction: Fine-Tuning a Large Language Model with Ease\n", - "\n", - "Welcome to this example notebook that demonstrates a simplified yet powerful approach to fine-tuning a Large Language Model (LLM) effortlessly. Fine-tuning is a crucial technique that allows you to adapt pre-trained language models to specific tasks, making them more contextually relevant and useful.\n", - "\n", - "In this notebook, we will walk you through a step-by-step process of fine-tuning a state-of-the-art language model using a user-friendly and efficient method. You don't need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness." 
- ] - }, - { - "cell_type": "markdown", - "id": "425249e9-f43f-45e6-aa25-9f53099049cd", - "metadata": {}, - "source": [ - "### First, we will select the model we wish to fine-tune and take the matching tokenizer and appropriate config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3410e9c2-0557-4961-995e-0ef0cc07bf82", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig\n", - "from transformers import logging\n", - "\n", - "logging.set_verbosity(\"CRITICAL\")\n", - "\n", - "model_name = \"tiiuae/falcon-7b\"\n", - "tokenizer = model_name\n", - "generation_config = GenerationConfig.from_pretrained(model_name)" - ] - }, - { - "cell_type": "markdown", - "id": "f33f3c35-cf61-4b0f-8da9-1c30d3b53230", - "metadata": {}, - "source": [ - "### Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8ee7c35-adf7-4ed8-9e7e-e659b9461cd5", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "\n", - "project = mlrun.get_or_create_project(\n", - " name=\"auto-trainer-test\",\n", - " context=\"./\",\n", - " user_project=True,\n", - " parameters={\n", - " \"default_image\": \"yonishelach/mlrun-llm\",\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d56b834f-adf6-4736-8de7-3348e050f561", - "metadata": {}, - "outputs": [], - "source": [ - "project.set_function(\n", - " \"auto-trainer.py\",\n", - " name=\"auto-trainer\",\n", - " kind=\"job\",\n", - " image=\"yonishelach/mlrun-llm\",\n", - " handler=\"finetune_llm\",\n", - ")\n", - "project.save()" - ] - }, - { - "cell_type": "markdown", - "id": "f42315db-6ddd-4dc1-89f3-c732f92d0d47", - "metadata": {}, - "source": [ - "### we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the 
function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e62e577-15fb-477d-9c56-fa9fb4c2669b", - "metadata": {}, - "outputs": [], - "source": [ - "import transformers\n", - "\n", - "training_arguments = {\n", - " \"per_device_train_batch_size\": 4,\n", - " \"gradient_accumulation_steps\": 1,\n", - " \"warmup_steps\": 2,\n", - " \"max_steps\": 10,\n", - " \"learning_rate\": 2e-4,\n", - " \"fp16\": True,\n", - " \"logging_steps\": 1,\n", - " \"optim\": \"paged_adamw_8bit\",\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "284a5772-f88d-46c9-87bc-fc14e434c1b4", - "metadata": {}, - "source": [ - "### Now we simply run the function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11ab5888-5870-4bf8-9657-db930adecd77", - "metadata": {}, - "outputs": [], - "source": [ - "training_run = mlrun.run_function(\n", - " function=\"auto-trainer\",\n", - " name=\"auto-trainer\",\n", - " local=True,\n", - " params={\n", - " \"model\": (model_name, \"transformers.AutoModelForCausalLM\"),\n", - " \"tokenizer\": tokenizer,\n", - " \"train_dataset\": \"Abirate/english_quotes\",\n", - " \"training_config\": training_arguments,\n", - " \"quantization_config\": True,\n", - " \"lora_config\": True,\n", - " \"dataset_columns_to_train\": \"quote\",\n", - " \"lora_target_modules\": [\"query_key_value\"],\n", - " \"model_pretrained_config\": {\"trust_remote_code\": True, \"use_cache\": False},\n", - " },\n", - " handler=\"finetune_llm\",\n", - " outputs=[\"model\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e674d25-5f1f-4ea8-af02-7d22c2fb6760", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a4dfe9b-407a-43c0-9c5e-56de106477ac", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mlrun-base", - "language": "python", - "name": "conda-env-mlrun-base-py" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/functions/development/huggingface_auto_trainer/latest/src/huggingface_auto_trainer.py b/functions/development/huggingface_auto_trainer/latest/src/huggingface_auto_trainer.py deleted file mode 100644 index d1166318..00000000 --- a/functions/development/huggingface_auto_trainer/latest/src/huggingface_auto_trainer.py +++ /dev/null @@ -1,855 +0,0 @@ -import importlib -import os -import shutil -import tempfile -import zipfile -from abc import ABC -from typing import Dict, List, Tuple, Union - -import mlrun -import numpy as np -import pandas as pd -import peft -import torch -import transformers -from datasets import Dataset, load_dataset -from mlrun.artifacts.manager import Artifact, PlotlyArtifact -from mlrun.datastore import is_store_uri -from mlrun.frameworks._common import CommonTypes, MLRunInterface -from mlrun.utils import logger -from peft import (LoraConfig, PeftModel, get_peft_model, - prepare_model_for_kbit_training) -from plotly import graph_objects as go -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, DataCollatorForLanguageModeling, - PreTrainedModel, PreTrainedTokenizer, Trainer, - TrainerCallback, TrainerControl, TrainerState, - TrainingArguments) - -supported_tasks = [ - "question-answering", - "summarization", - "table-question-answering", - "text2text-generation", - "text-classification", - "sentiment-analysis", - "text-generation", - "token-classification", - "translation", - "translation_xx_to_yy", -] - - -class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" 
- data_collator = "data_collator" - - -# ----------------------from MLRUN-------------------------------- -class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - - @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - ) - - @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper - - -class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. 
- """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([]) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - 
return - self._is_training = True - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return - - def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False) - - def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact) - - -def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - 
tag=tag, - labels=labels, - extra_data=extra_data, - ) - ) - - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - 
logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = 
tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = 
tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in 
dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. 
- :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). - :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load 
datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! 
- ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - ) - - -def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - 
end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. - neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl) diff --git a/functions/development/huggingface_auto_trainer/latest/src/item.yaml b/functions/development/huggingface_auto_trainer/latest/src/item.yaml deleted file mode 100644 index e556c11d..00000000 --- a/functions/development/huggingface_auto_trainer/latest/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- model-training -description: fine-tune llm model with ease -doc: '' -example: huggingface_auto_trainer.ipynb -generationDate: 2023-08-21:17-25 -hidden: false -icon: '' -labels: - author: Zeevr -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.0 -name: huggingface-auto-trainer -platformVersion: 3.5.0 -spec: - filename: huggingface_auto_trainer.py - handler: finetune_llm - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.0.0 diff --git a/functions/development/huggingface_auto_trainer/latest/src/requirements.txt b/functions/development/huggingface_auto_trainer/latest/src/requirements.txt deleted file mode 100644 index 1376b1d0..00000000 --- a/functions/development/huggingface_auto_trainer/latest/src/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -peft -transformers -torch -datasets -plotly diff --git 
a/functions/development/huggingface_auto_trainer/latest/src/test_huggingface_auto_trainer.py b/functions/development/huggingface_auto_trainer/latest/src/test_huggingface_auto_trainer.py deleted file mode 100644 index 53576e4e..00000000 --- a/functions/development/huggingface_auto_trainer/latest/src/test_huggingface_auto_trainer.py +++ /dev/null @@ -1,42 +0,0 @@ -import tempfile - -import mlrun - - -def test_train(): - - model_name = "distilgpt2" - tokenizer = model_name - auto_trainer = mlrun.import_function("function.yaml") - - training_arguments = { - "per_device_train_batch_size": 4, - "gradient_accumulation_steps": 1, - "warmup_steps": 2, - "max_steps": 10, - "learning_rate": 2e-4, - "logging_steps": 1, - } - - params = { - "model": (model_name, "transformers.AutoModelForCausalLM"), - "tokenizer": tokenizer, - "train_dataset": "Abirate/english_quotes", - "training_config": training_arguments, - "dataset_columns_to_train": "quote", - "model_pretrained_config": {"use_cache": False}, - "use_cuda": False, - } - - try: - with tempfile.TemporaryDirectory() as test_directory: - auto_trainer.run( - local=True, - params=params, - handler="finetune_llm", - returns=["model"], - workdir=test_directory, - ) - - except Exception as exception: - print(f"- The training failed - raised the following error:\n- {exception}") diff --git a/functions/development/huggingface_auto_trainer/latest/static/documentation.html b/functions/development/huggingface_auto_trainer/latest/static/documentation.html deleted file mode 100644 index be893164..00000000 --- a/functions/development/huggingface_auto_trainer/latest/static/documentation.html +++ /dev/null @@ -1,380 +0,0 @@ - - - - - - - -huggingface_auto_trainer package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

huggingface_auto_trainer package

- -
- -
-
-
-
-
-

huggingface_auto_trainer package#

-
-

Submodules#

-
-
-

huggingface_auto_trainer.huggingface_auto_trainer module#

-
-
-class huggingface_auto_trainer.huggingface_auto_trainer.ConfigKeys[source]#
-

Bases: object

-
-
-data_collator = 'data_collator'#
-
-
-
-deepspeed = 'deepspeed'#
-
-
-
-lora = 'lora'#
-
-
-
-model_pretrained = 'model_pretrained'#
-
-
-
-quantization = 'quantization'#
-
-
-
-tokenizer_pretrained = 'tokenizer_pretrained'#
-
-
-
-training = 'training'#
-
-
-
-
-class huggingface_auto_trainer.huggingface_auto_trainer.HFTrainerMLRunInterface[source]#
-

Bases: abc.ABC, Generic[mlrun.frameworks._common.utils.MLRunInterfaceableType]

-

This is temporary and will be built in mlrun 1.5.0 -Interface for adding MLRun features for tensorflow keras API.

-
-
-DEFAULT_CONTEXT_NAME = 'mlrun-huggingface'#
-
-
-
-classmethod add_interface(obj: transformers.Trainer, restoration: Optional[Tuple[Dict[str, Any], Dict[str, Any], List[str]]] = None)[source]#
-

Enrich the object with this interface properties, methods and functions so it will have this framework MLRun’s -features.

-
-
Parameters
-
    -
  • obj – The object to enrich his interface.

  • -
  • restoration – Restoration information tuple as returned from ‘remove_interface’ in order to add the -interface in a certain state.

  • -
-
-
-
-
-
-classmethod mlrun_train()[source]#
-
-
-
-
-class huggingface_auto_trainer.huggingface_auto_trainer.MLRunCallback(*args: Any, **kwargs: Any)[source]#
-

Bases: transformers.

-

This is temporary and will be built in mlrun 1.5.0 -Callback for collecting logs during training / evaluation of the Trainer API.

-
-
-log_metric_plot(name: str, scores: List[float])[source]#
-
-
-
-log_metrics()[source]#
-
-
-
-on_epoch_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_epoch_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_evaluate(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_log(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs: Optional[Dict[str, float]] = None, **kwargs)[source]#
-
-
-
-on_train_begin(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs)[source]#
-
-
-
-on_train_end(args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, model: Optional[transformers.PreTrainedModel] = None, tokenizer: Optional[transformers.PreTrainedTokenizer] = None, **kwargs)[source]#
-
-
-
-
-huggingface_auto_trainer.huggingface_auto_trainer.apply_mlrun(trainer: transformers.Trainer, model_name: Optional[str] = None, tag: str = '', context: Optional[mlrun.execution.MLClientCtx] = None, auto_log: bool = True, labels: Optional[Dict[str, str]] = None, extra_data: Optional[dict] = None, **kwargs)[source]#
-

This is temporary and will be built in mlrun 1.5.0

-
-
-
-huggingface_auto_trainer.huggingface_auto_trainer.evaluate(context, model_path, data: pandas.core.frame.DataFrame, model_name: Optional[str] = None, tokenizer_name: Optional[str] = None)[source]#
-

Evaluating the model using perplexity, for more information visit: -https://huggingface.co/docs/transformers/perplexity

-
-
Parameters
-
    -
  • context – mlrun context

  • -
  • model_path – path to the model directory

  • -
  • data – the data to evaluate the model

  • -
  • model_name – name of base model

  • -
  • tokenizer_name – name of base tokenizer

  • -
-
-
-
-
-
-huggingface_auto_trainer.huggingface_auto_trainer.finetune_llm(context: mlrun.execution.MLClientCtx, train_dataset: Union[str, mlrun.datastore.base.DataItem], eval_dataset: Optional[str] = None, train_load_dataset_kwargs: dict = {}, eval_load_dataset_kwargs: dict = {}, dataset_columns_to_train: Union[str, list] = 'text', model: Union[str, List[str]] = 'huggingface-model', tokenizer: Optional[Union[str, List[str]]] = None, deepspeed_config: Union[dict, bool] = False, quantization_config: Union[dict, bool] = False, lora_config: Union[dict, bool] = False, training_config: dict = {}, model_pretrained_config: dict = {}, tokenizer_pretrained_config: dict = {}, data_collator_config: dict = {}, task: str = 'text-generation', use_cuda: bool = True, framework: str = 'pt', device_map: str = 'auto', **kwargs)[source]#
-
-
Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.

The function takes various configuration parameters to customize the training process -and adapt the model to specific tasks using a provided dataset.

-
-
-
-
Parameters
-
    -
  • context – mlrun context in order to log trained model

  • -
  • dataset_columns_to_train – which columns to pass to the model as inputs

  • -
  • eval_load_dataset_kwargs – kwargs for dataset loading

  • -
  • train_load_dataset_kwargs – kwargs for dataset loading

  • -
  • framework – pt ot tf

  • -
  • use_cuda – use gpu or not

  • -
  • tokenizer_pretrained_config – config to load the pretrained tokenizer

  • -
  • model_pretrained_config – config to load the pretrained model

  • -
  • tokenizer – a tuple containing tokenizer name and class, or str with tokenizer name or path

  • -
  • model – a tuple containing model name and class, or str with model name or path

  • -
  • train_dataset – The train dataset used for fine-tuning the language model.

  • -
  • eval_dataset – The eval dataset used for evaluate the language model during training.

  • -
  • deepspeed_config – Configuration options for DeepSpeed (optional).

  • -
  • quantization_config – Configuration options for model quantization (optional).

  • -
  • lora_config – Configuration options for Low-Rank Approximation (LoRA) (optional).

  • -
  • training_config – Configuration options specific to the fine-tuning training process (optional).

  • -
  • data_collator_config – Configuration options for data collation during training (optional).

  • -
  • task – A description of the specific task the model is being fine-tuned for.

  • -
  • kwargs – Additional keyword arguments.

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/latest/static/example.html b/functions/development/huggingface_auto_trainer/latest/static/example.html deleted file mode 100644 index 7ae9a6c4..00000000 --- a/functions/development/huggingface_auto_trainer/latest/static/example.html +++ /dev/null @@ -1,351 +0,0 @@ - - - - - - - -Llm auto trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Llm auto trainer#

-
-

Notebook Introduction: Fine-Tuning a Large Language Model with Ease#

-

Welcome to this example notebook that demonstrates a simplified yet powerful approach to fine-tuning a Large Language Model (LLM) effortlessly. Fine-tuning is a crucial technique that allows you to adapt pre-trained language models to specific tasks, making them more contextually relevant and useful.

-

In this notebook, we will walk you through a step-by-step process of fine-tuning a state-of-the-art language model using a user-friendly and efficient method. You don’t need to be an expert in machine learning or natural language processing to follow along – our approach focuses on simplicity and effectiveness.

-
-

First, we will select the model we wish to fine-tune and take the matching tokenizer and appropriate config#

-
-
-
import os
-from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
-from transformers import logging
-
-logging.set_verbosity("CRITICAL")
-
-model_name = "tiiuae/falcon-7b"
-tokenizer = model_name
-generation_config = GenerationConfig.from_pretrained(model_name)
-
-
-
-
-
-
-

Then, in order to use with mlrun, we will create an mlrun project and create an mlrun function#

-
-
-
import mlrun
-
-project = mlrun.get_or_create_project(
-    name="auto-trainer-test",
-    context="./",
-    user_project=True,
-    parameters={
-        "default_image": "yonishelach/mlrun-llm",
-    },
-)
-
-
-
-
-
-
-
project.set_function(
-    "auto-trainer.py",
-    name="auto-trainer",
-    kind="job",
-    image="yonishelach/mlrun-llm",
-    handler="finetune_llm",
-)
-project.save()
-
-
-
-
-
-
-

we can set the every config or parameter we want, including training arguments, hyper parameters and more, and pass to the function#

-
-
-
import transformers
-
-training_arguments = {
-    "per_device_train_batch_size": 4,
-    "gradient_accumulation_steps": 1,
-    "warmup_steps": 2,
-    "max_steps": 10,
-    "learning_rate": 2e-4,
-    "fp16": True,
-    "logging_steps": 1,
-    "optim": "paged_adamw_8bit",
-}
-
-
-
-
-
-
-

Now we simply run the function#

-
-
-
training_run = mlrun.run_function(
-    function="auto-trainer",
-    name="auto-trainer",
-    local=True,
-    params={
-        "model": (model_name, "transformers.AutoModelForCausalLM"),
-        "tokenizer": tokenizer,
-        "train_dataset": "Abirate/english_quotes",
-        "training_config": training_arguments,
-        "quantization_config": True,
-        "lora_config": True,
-        "dataset_columns_to_train": "quote",
-        "lora_target_modules": ["query_key_value"],
-        "model_pretrained_config": {"trust_remote_code": True, "use_cache": False},
-    },
-    handler="finetune_llm",
-    outputs=["model"],
-)
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/latest/static/function.html b/functions/development/huggingface_auto_trainer/latest/static/function.html deleted file mode 100644 index 9a1f2953..00000000 --- a/functions/development/huggingface_auto_trainer/latest/static/function.html +++ /dev/null @@ -1,371 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: huggingface-auto-trainer
-  tag: ''
-  hash: 4459f0b675c36a20c8f542126a96b98b0ac82271
-  project: ''
-  labels:
-    author: Zeevr
-  categories:
-  - machine-learning
-  - model-training
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: 
-    commands: []
-    code_origin: https://github.com/ZeevRispler/functions.git#a63a647cf6bc3015a8dcbd18903f9db44bdf0b66:/Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py
-    origin_filename: /Users/Zeev_Rispler/PycharmProjects/functions/huggingface_auto_trainer/huggingface_auto_trainer.py
-    requirements: []
-  entry_points:
-    add_interface:
-      name: add_interface
-      doc: ''
-      parameters:
-      - name: cls
-        default: ''
-      - name: obj
-        type: Trainer
-        default: ''
-      - name: restoration
-        type: MLRunInterfaceRestorationType
-        default: null
-      outputs:
-      - default: ''
-      lineno: 70
-    mlrun_train:
-      name: mlrun_train
-      doc: ''
-      parameters:
-      - name: cls
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 80
-    wrapper:
-      name: wrapper
-      doc: ''
-      parameters:
-      - name: self
-        type: Trainer
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 81
-    on_epoch_begin:
-      name: on_epoch_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 129
-    on_epoch_end:
-      name: on_epoch_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 140
-    on_log:
-      name: on_log
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: logs
-        type: Dict[str, float]
-        default: null
-      outputs:
-      - default: ''
-      lineno: 151
-    on_train_begin:
-      name: on_train_begin
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 177
-    on_train_end:
-      name: on_train_end
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      - name: model
-        type: PreTrainedModel
-        default: null
-      - name: tokenizer
-        type: PreTrainedTokenizer
-        default: null
-      outputs:
-      - default: ''
-      lineno: 188
-    on_evaluate:
-      name: on_evaluate
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: args
-        type: TrainingArguments
-        default: ''
-      - name: state
-        type: TrainerState
-        default: ''
-      - name: control
-        type: TrainerControl
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 201
-    log_metrics:
-      name: log_metrics
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 215
-    log_metric_plot:
-      name: log_metric_plot
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: name
-        type: str
-        default: ''
-      - name: scores
-        type: List[float]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 222
-    apply_mlrun:
-      name: apply_mlrun
-      doc: This is temporary and will be built in mlrun 1.5.0
-      parameters:
-      - name: trainer
-        type: Trainer
-        default: ''
-      - name: model_name
-        type: str
-        default: null
-      - name: tag
-        type: str
-        default: ''
-      - name: context
-        type: MLClientCtx
-        default: null
-      - name: auto_log
-        type: bool
-        default: true
-      - name: labels
-        type: Dict[str, str]
-        default: null
-      - name: extra_data
-        type: dict
-        default: null
-      outputs:
-      - default: ''
-      lineno: 244
-    finetune_llm:
-      name: finetune_llm
-      doc: "Fine-tunes a Language Model (LLM) on a specific task using the provided\
-        \ dataset.\n The function takes various configuration parameters to customize\
-        \ the training process\n and adapt the model to specific tasks using a provided\
-        \ dataset."
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: mlrun context in order to log trained model
-        default: ''
-      - name: train_dataset
-        type: Union[str, mlrun.datastore.DataItem]
-        doc: The train dataset used for fine-tuning the language model.
-        default: ''
-      - name: eval_dataset
-        type: str
-        doc: The eval dataset used for evaluate the language model during training.
-        default: null
-      - name: train_load_dataset_kwargs
-        type: dict
-        doc: kwargs for dataset loading
-        default: {}
-      - name: eval_load_dataset_kwargs
-        type: dict
-        doc: kwargs for dataset loading
-        default: {}
-      - name: dataset_columns_to_train
-        type: Union[str, list]
-        doc: which columns to pass to the model as inputs
-        default: text
-      - name: model
-        type: Union[str, List[str]]
-        doc: a tuple containing model name and class, or str with model name or path
-        default: huggingface-model
-      - name: tokenizer
-        type: Union[str, List[str]]
-        doc: a tuple containing tokenizer name and class, or str with tokenizer name
-          or path
-        default: null
-      - name: deepspeed_config
-        type: Union[dict, bool]
-        doc: Configuration options for DeepSpeed (optional).
-        default: false
-      - name: quantization_config
-        type: Union[dict, bool]
-        doc: Configuration options for model quantization (optional).
-        default: false
-      - name: lora_config
-        type: Union[dict, bool]
-        doc: Configuration options for Low-Rank Approximation (LoRA) (optional).
-        default: false
-      - name: training_config
-        type: dict
-        doc: Configuration options specific to the fine-tuning training process (optional).
-        default: {}
-      - name: model_pretrained_config
-        type: dict
-        doc: config to load the pretrained model
-        default: {}
-      - name: tokenizer_pretrained_config
-        type: dict
-        doc: config to load the pretrained tokenizer
-        default: {}
-      - name: data_collator_config
-        type: dict
-        doc: Configuration options for data collation during training (optional).
-        default: {}
-      - name: task
-        type: str
-        doc: A description of the specific task the model is being fine-tuned for.
-        default: text-generation
-      - name: use_cuda
-        type: bool
-        doc: use gpu or not
-        default: true
-      - name: framework
-        type: str
-        doc: pt ot tf
-        default: pt
-      - name: device_map
-        type: str
-        default: auto
-      outputs:
-      - default: ''
-      lineno: 630
-    evaluate:
-      name: evaluate
-      doc: 'Evaluating the model using perplexity, for more information visit:
-
-        https://huggingface.co/docs/transformers/perplexity'
-      parameters:
-      - name: context
-        doc: mlrun context
-        default: ''
-      - name: model_path
-        doc: path to the model directory
-        default: ''
-      - name: data
-        type: DataFrame
-        doc: the data to evaluate the model
-        default: ''
-      - name: model_name
-        type: str
-        doc: name of base model
-        default: null
-      - name: tokenizer_name
-        type: str
-        doc: name of base tokenizer
-        default: null
-      outputs:
-      - default: ''
-      lineno: 784
-  description: fine-tune llm model with ease
-  default_handler: finetune_llm
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/latest/static/huggingface_auto_trainer.html b/functions/development/huggingface_auto_trainer/latest/static/huggingface_auto_trainer.html deleted file mode 100644 index 2063d183..00000000 --- a/functions/development/huggingface_auto_trainer/latest/static/huggingface_auto_trainer.html +++ /dev/null @@ -1,995 +0,0 @@ - - - - - - - -huggingface_auto_trainer.huggingface_auto_trainer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for huggingface_auto_trainer.huggingface_auto_trainer

-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-
[docs]class ConfigKeys: - deepspeed = "deepspeed" - quantization = "quantization" - lora = "lora" - training = "training" - tokenizer_pretrained = "tokenizer_pretrained" - model_pretrained = "model_pretrained" - data_collator = "data_collator"
- - -# ----------------------from MLRUN-------------------------------- -
[docs]class HFTrainerMLRunInterface(MLRunInterface, ABC): - """ - This is temporary and will be built in mlrun 1.5.0 - Interface for adding MLRun features for tensorflow keras API. - """ - - # MLRuns context default name: - DEFAULT_CONTEXT_NAME = "mlrun-huggingface" - - # Attributes to replace so the MLRun interface will be fully enabled. - _REPLACED_METHODS = [ - "train", - # "evaluate" - ] - -
[docs] @classmethod - def add_interface( - cls, - obj: Trainer, - restoration: CommonTypes.MLRunInterfaceRestorationType = None, - ): - super(HFTrainerMLRunInterface, cls).add_interface( - obj=obj, restoration=restoration - )
- -
[docs] @classmethod - def mlrun_train(cls): - def wrapper(self: Trainer, *args, **kwargs): - # Restore the evaluation method as `train` will use it: - # cls._restore_attribute(obj=self, attribute_name="evaluate") - - # Call the original fit method: - result = self.original_train(*args, **kwargs) - - # Replace the evaluation method again: - # cls._replace_function(obj=self, function_name="evaluate") - - return result - - return wrapper
- - -
[docs]class MLRunCallback(TrainerCallback): - """ - This is temporary and will be built in mlrun 1.5.0 - Callback for collecting logs during training / evaluation of the `Trainer` API. - """ - - def __init__( - self, - context: mlrun.MLClientCtx = None, - model_name: str = "model", - tag: str = "", - labels: Dict[str, str] = None, - extra_data: dict = None, - ): - super().__init__() - - # Store the configurations: - self._context = ( - context - if context is not None - else mlrun.get_or_create_ctx("./mlrun-huggingface") - ) - self._model_name = model_name - self._tag = tag - self._labels = labels - self._extra_data = extra_data if extra_data is not None else {} - - # Set up the logging mode: - self._is_training = False - self._steps: List[List[int]] = [] - self._metric_scores: Dict[str, List[float]] = {} - self._artifacts: Dict[str, Artifact] = {} - -
[docs] def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._steps.append([])
- -
[docs] def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics()
- -
[docs] def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Dict[str, float] = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - recent_logs = state.log_history[-1].copy() - - recent_logs.pop("epoch") - current_step = int(recent_logs.pop("step")) - if current_step not in self._steps[-1]: - self._steps[-1].append(current_step) - - for metric_name, metric_score in recent_logs.items(): - if metric_name.startswith("train_"): - if metric_name.split("train_")[1] not in self._metric_scores: - self._metric_scores[metric_name] = [metric_score] - continue - if metric_name not in self._metric_scores: - self._metric_scores[metric_name] = [] - self._metric_scores[metric_name].append(metric_score)
- -
[docs] def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self._is_training = True
- -
[docs] def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - model: PreTrainedModel = None, - tokenizer: PreTrainedTokenizer = None, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics()
- -
[docs] def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not state.is_world_process_zero: - return - self.log_metrics() - - if self._is_training: - return
- -
[docs] def log_metrics(self): - for metric_name, metric_scores in self._metric_scores.items(): - self._context.log_result(key=metric_name, value=metric_scores[-1]) - if len(metric_scores) > 1: - self.log_metric_plot(name=metric_name, scores=metric_scores) - self._context.commit(completed=False)
- -
[docs] def log_metric_plot(self, name: str, scores: List[float]): - # Initialize a plotly figure: - metric_figure = go.Figure() - - # Add titles: - metric_figure.update_layout( - title=name.capitalize().replace("_", " "), - xaxis_title="Samples", - yaxis_title="Scores", - ) - - # Draw: - metric_figure.add_trace( - go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines") - ) - - # Create the plotly artifact: - artifact_name = f"{name}_plot" - artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure) - self._artifacts[artifact_name] = self._context.log_artifact(artifact)
- - -
[docs]def apply_mlrun( - trainer: transformers.Trainer, - model_name: str = None, - tag: str = "", - context: mlrun.MLClientCtx = None, - auto_log: bool = True, - labels: Dict[str, str] = None, - extra_data: dict = None, - **kwargs, -): - """ - This is temporary and will be built in mlrun 1.5.0 - """ - # Get parameters defaults: - if context is None: - context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME) - - HFTrainerMLRunInterface.add_interface(obj=trainer) - - if auto_log: - trainer.add_callback( - MLRunCallback( - context=context, - model_name=model_name, - tag=tag, - labels=labels, - extra_data=extra_data, - ) - )
- - -# ----------------------end from MLRUN-------------------------------- - - -def _print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%:" - f" {100 * trainable_params / all_param}" - ) - - -# default configs -# will be used if user provides "True" with config name as input -QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -LORA_CONFIG = peft.LoraConfig( - r=8, - lora_alpha=32, - target_modules=["query_key_value"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -DEEPSPEED_CONFIG = { - "train_micro_batch_size_per_gpu": "auto", - "fp16": {"enabled": True}, - "autotuning": { - "enabled": True, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps ": "--gradient_accumulation_steps", - }, - }, - "zero_optimization": { - "stage": 2, - }, -} - - -def _update_config(src: dict, dst: dict): - """ - update configs according to user, this way the user can add/modify values in default configs for e.g. - - goes over all configs and corresponding prefixes, collect all the keys from the given dict that start - with the prefix and add them to appropriate config - - :param src: dict of all candidate values to update dict. - :param dst: dict containing all configs to update. 
- """ - - for config_name, config in dst.items(): - - # If given True we use default dict - # Can also be False or a config dict given from user, so we check specifically fo True - if config is True and config_name == "quantization": - config = QUANTIZATION_CONFIG - - if config is True and config_name == "lora": - config = LORA_CONFIG - - if config is True and config_name == "deepspeed": - config = DEEPSPEED_CONFIG - - # in some cases we can get a boolean value, in that case no need to look for args - if isinstance(config, bool): - config = None - - elif isinstance(config, dict): - for key, val in src.items(): - if key.startswith(config_name): - config[key.replace(f"{config_name}_", "")] = val - - # update by config name - else: - for key, val in src.items(): - if key.startswith(config_name): - setattr(config, key.replace(f"{config_name}_", ""), val) - - dst.update({config_name: config}) - - -def _get_class_object(class_path: str) -> type: - """ - given a full class name, this function returns the correct class - - :param class_path: a full class name (ex. 
'transformers.AutoModelForCausalLM') - - :return the wanted class object - """ - module_path, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - return getattr(module, class_name) - - -def _set_model_and_tokenizer( - model: Union[str, List[str]], - tokenizer: Union[str, List[str]], - task: str, - framework: str, - lora_config: dict, - quantization_config: dict, - use_cuda: bool, - tokenizer_pretrained_config, - model_pretrained_config, - device_map: str, -): - """ - get the correct model and tokenizer according to given user inputs - - :param model: a tuple containing model name and class, or str with model name or path - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param task: a supported nlp task, used to choose model if not provided - :param framework: pt or tf - :param lora_config: lora config or None, to load model in appropriate way - :param quantization_config: quantization config or None, to load model in appropriate way - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param device_map: a device map for model training if using number of gpu's - - :returns: model and tokenizer - """ - # if task is not supported and no model was given we can't choose one - if task and task not in supported_tasks and not model: - logger.error("unsupported task option chosen") - raise - - # load model from store - if isinstance(model, str) and is_store_uri(model): - pass - # TODO: load both model and tokenizer and return, need guy's help - - # if it's a tuple them we assume it contains of both name and class - if isinstance(model, list): - model_name, model_class = model - model_class = _get_class_object(model_class) - - # in the case we don't get the model class we need the task in order to choose the correct model - else: - if task is None: - 
logger.error("task must be chosen in order to determine the correct model") - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - _, available_classes, task_options = transformers.pipelines.check_task(task) - - if isinstance(model, str): - model_name = model - - # if model is not given, we take the default model for the given task - else: - model_name, _ = transformers.pipelines.get_default_model_and_revision( - available_classes, framework, task_options - ) - if not available_classes.get(framework, tuple()): - logger.error( - "given task's default model is not supported in specified framework" - ) - raise Exception( - "this function requires either a supported task or a model and model class to be chosen" - ) - - model_class = available_classes[framework][0] - - # load the pretrained model - if use_cuda: - device_map = device_map - else: - device_map = None - - model = model_class.from_pretrained( - model_name, - quantization_config=quantization_config, - device_map=device_map, - **model_pretrained_config, - ) - - # If quantization config is given we will load a quantized model, if not a regular one - if quantization_config: - model.gradient_checkpointing_enable() - model = peft.prepare_model_for_kbit_training(model) - - # If lora config was given we want to do lora fine tune, we update model here - if lora_config: - model = peft.get_peft_model(model, lora_config) - - # if not specified we choose the default tokenizer that corresponding to the model - if tokenizer is None: - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - return model_name, model, tokenizer - - if isinstance(tokenizer, str): - tokenizer_name = tokenizer - tokenizer_class = transformers.AutoTokenizer - - # if it's not a str then it's a tuple of both name and class - else: - tokenizer_name, tokenizer_class = tokenizer - tokenizer_class = _get_class_object(tokenizer_class) - - tokenizer = 
tokenizer_class.from_pretrained( - tokenizer_name, **tokenizer_pretrained_config - ) - - tokenizer.pad_token = tokenizer.eos_token - - return model_name, model, tokenizer - - -def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset: - """ - loads the specific dataset provided by the user - - :param dataset: name or path of dataset to load - :param is_train: bool that indicates the purpose of the dataset - :param kwargs: other kwargs for loading the dataset - - :returns: loaded dataset - """ - # if split in kwargs then the user decides how to split the dataset - if "split" in kwargs: - return load_dataset(dataset, **kwargs) - - # if it's a dataset for train we split with train - if is_train: - return load_dataset(dataset, split="train", **kwargs) - - # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them - dataset = load_dataset(dataset, **kwargs) - if "test" in dataset: - return dataset.get("test") - elif "eval" in dataset: - return dataset.get("eval") - elif "validation" in dataset: - return dataset.get("validation") - - -def _prepare_dataset( - train_dataset: str, - eval_dataset: str, - train_load_dataset_kwargs, - eval_load_dataset_kwargs, - tokenizer, - dataset_columns_to_train: Union[str, list], -) -> (Dataset, Union[Dataset, None]): - """ - Loads the train and eval datasets (if provided) passes them through the tokenizer and - returns them ready to use in training - - :param train_dataset: the name or path to the train dataset - :param eval_dataset: the name or path to the eval dataset - :param dataset_columns_to_train: which columns to pass to the model as inputs - (need to pass through the tokenizer first) - :param train_load_dataset_kwargs: kwargs for dataset loading - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param tokenizer: the tokenizer to pass the data through - - :returns: tokenized datasets - """ - if not tokenizer.pad_token: - tokenizer.pad_token = 
tokenizer.eos_token - - # we take col name/s in a list for easy generalization - if isinstance(dataset_columns_to_train, str): - dataset_columns_to_train = [dataset_columns_to_train] - - if isinstance(train_dataset, mlrun.datastore.DataItem): - train_dataset = Dataset.from_pandas(train_dataset.as_df()) - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - - # Load datasets - # if provided two paths/names we load each separately using designated func - if eval_dataset: - train_dataset = _dataset_loader( - dataset=train_dataset, is_train=True, **train_load_dataset_kwargs - ) - eval_dataset = _dataset_loader( - dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs - ) - - # if only on path is given then we must check if it contains both dataset or if only one should be used - else: - dataset = load_dataset(train_dataset, **train_load_dataset_kwargs) - if "train" in dataset: - train_dataset = dataset.get("train") - if "test" in dataset: - eval_dataset = dataset.get("test") - elif "eval" in dataset: - eval_dataset = dataset.get("eval") - elif "validation" in dataset: - eval_dataset = dataset.get("validation") - else: - # only train dataset given, tokenize and return it - return ( - train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ), - None, - ) - else: - logger.error("train dataset is mandatory") - raise KeyError("no train dataset found in given dataset") - - # Tokenize the data so the model can understand it - tokenized_train_dataset = train_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - tokenized_eval_dataset = eval_dataset.map( - lambda examples: tokenizer( - *[examples[col] for col in 
dataset_columns_to_train], - truncation=True, - padding=True, - ), - batched=True, - ) - - return tokenized_train_dataset, tokenized_eval_dataset - - -
[docs]def finetune_llm( - context: mlrun.MLClientCtx, - train_dataset: Union[str, mlrun.datastore.DataItem], - eval_dataset: str = None, - train_load_dataset_kwargs: dict = {}, - eval_load_dataset_kwargs: dict = {}, - dataset_columns_to_train: Union[str, list] = "text", - model: Union[str, List[str]] = "huggingface-model", - tokenizer: Union[str, List[str]] = None, - deepspeed_config: Union[dict, bool] = False, - quantization_config: Union[dict, bool] = False, - lora_config: Union[dict, bool] = False, - training_config: dict = {}, - model_pretrained_config: dict = {}, - tokenizer_pretrained_config: dict = {}, - data_collator_config: dict = {}, - task: str = "text-generation", - use_cuda: bool = True, - framework: str = "pt", - device_map: str = "auto", - **kwargs, -): - """ - Fine-tunes a Language Model (LLM) on a specific task using the provided dataset. - The function takes various configuration parameters to customize the training process - and adapt the model to specific tasks using a provided dataset. - - :param context: mlrun context in order to log trained model - :param dataset_columns_to_train: which columns to pass to the model as inputs - :param eval_load_dataset_kwargs: kwargs for dataset loading - :param train_load_dataset_kwargs: kwargs for dataset loading - :param framework: pt ot tf - :param use_cuda: use gpu or not - :param tokenizer_pretrained_config: config to load the pretrained tokenizer - :param model_pretrained_config: config to load the pretrained model - :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path - :param model: a tuple containing model name and class, or str with model name or path - :param train_dataset: The train dataset used for fine-tuning the language model. - :param eval_dataset: The eval dataset used for evaluate the language model during training. - :param deepspeed_config: Configuration options for DeepSpeed (optional). 
- :param quantization_config: Configuration options for model quantization (optional). - :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional). - :param training_config: Configuration options specific to the fine-tuning training process (optional). - :param data_collator_config: Configuration options for data collation during training (optional). - :param task: A description of the specific task the model is being fine-tuned for. - :param kwargs: Additional keyword arguments. - """ - - # TODO: match forward.keyword to dataset.keyword - check if relevant in new design - # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design - - # Look for updates to configs given in kwargs - configs = { - ConfigKeys.deepspeed: deepspeed_config, - ConfigKeys.quantization: quantization_config, - ConfigKeys.lora: lora_config, - ConfigKeys.training: training_config, - ConfigKeys.model_pretrained: model_pretrained_config, - ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config, - ConfigKeys.data_collator: data_collator_config, - } - _update_config(dst=configs, src=kwargs) - - # check gpu permission and availability - if use_cuda: - if torch.cuda.is_available(): - # Clean gpu cache - torch.cuda.empty_cache() - else: - logger.warning("'use_cuda' is set to True, but no cuda device is available") - - # get model and tokenizer - model_name, model, tokenizer = _set_model_and_tokenizer( - model=model, - tokenizer=tokenizer, - task=task, - framework=framework, - lora_config=configs[ConfigKeys.lora], - quantization_config=configs[ConfigKeys.quantization], - use_cuda=use_cuda, - tokenizer_pretrained_config=tokenizer_pretrained_config, - model_pretrained_config=configs[ConfigKeys.model_pretrained], - device_map=device_map, - ) - - # Load datasets - tokenized_train, tokenized_eval = _prepare_dataset( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - 
train_load_dataset_kwargs=train_load_dataset_kwargs, - eval_load_dataset_kwargs=eval_load_dataset_kwargs, - tokenizer=tokenizer, - dataset_columns_to_train=dataset_columns_to_train, - ) - - # Initialize the data collator for the trainer to use in order to create batches of data - data_collator = transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, **data_collator_config - ) - - # Initialize training kwargs from user kwargs: - train_kwargs = configs[ConfigKeys.training] - - # If deepspeed config given we add it to training kwargs - if configs[ConfigKeys.deepspeed]: - train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed] - - # Take a look at the trainable parameters in the model - _print_trainable_parameters(model) - - # Preparing training arguments: - training_args = transformers.TrainingArguments( - output_dir=tempfile.mkdtemp(), - **train_kwargs, - ) - - trainer = transformers.Trainer( - model=model, - train_dataset=tokenized_train, - eval_dataset=tokenized_eval, - tokenizer=tokenizer, - data_collator=data_collator, - args=training_args, - ) - - apply_mlrun(trainer, model_name=model_name.split("/")[-1]) - model.config.use_cache = ( - False # silence the warnings. Please re-enable for inference! - ) - - # Apply training with evaluation: - context.logger.info(f"training '{model_name}'") - trainer.train() - - temp_directory = tempfile.TemporaryDirectory().name - trainer.save_model(temp_directory) - - # Zip the model directory: - shutil.make_archive( - base_name="model", - format="zip", - root_dir=temp_directory, - ) - - # Log the model: - context.log_model( - key="model", - db_key=model_name.split("/")[-1], - model_file="model.zip", - tag="", - framework="Hugging Face", - )
- - -
[docs]def evaluate( - context, - model_path, - data: pd.DataFrame, - model_name: str = None, - tokenizer_name: str = None, -): - """ - Evaluating the model using perplexity, for more information visit: - https://huggingface.co/docs/transformers/perplexity - - :param context: mlrun context - :param model_path: path to the model directory - :param data: the data to evaluate the model - :param model_name: name of base model - :param tokenizer_name: name of base tokenizer - """ - # Get the model artifact and file: - ( - model_file, - model_artifact, - extra_data, - ) = mlrun.artifacts.get_model(model_path) - - # Read the name: - _model_name = model_artifact.spec.db_key - - # Extract logged model files: - model_directory = os.path.join(os.path.dirname(model_file), _model_name) - with zipfile.ZipFile(model_file, "r") as zip_file: - zip_file.extractall(model_directory) - - # Loading the saved pretrained tokenizer and model: - dataset = Dataset.from_pandas(data) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True - ) - model = PeftModel.from_pretrained(model, model_directory) - model.eval() - encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt") - - max_length = 1024 - stride = 512 - seq_len = encodings.input_ids.size(1) - - nlls = [] - prev_end_loc = 0 - for begin_loc in range(0, seq_len, stride): - end_loc = min(begin_loc + max_length, seq_len) - trg_len = end_loc - prev_end_loc # may be different from stride on last loop - input_ids = encodings.input_ids[:, begin_loc:end_loc] - target_ids = input_ids.clone() - target_ids[:, :-trg_len] = -100 - - with torch.no_grad(): - outputs = model(input_ids.cuda(), labels=target_ids) - - # loss is calculated using CrossEntropyLoss which averages over valid labels - # N.B. 
the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels - # to the left by 1. - neg_log_likelihood = outputs.loss - - nlls.append(neg_log_likelihood) - - prev_end_loc = end_loc - if end_loc == seq_len: - break - - ppl = torch.exp(torch.stack(nlls).mean()).item() - context.log_result("perplexity", ppl)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/latest/static/item.html b/functions/development/huggingface_auto_trainer/latest/static/item.html deleted file mode 100644 index 2ca49a41..00000000 --- a/functions/development/huggingface_auto_trainer/latest/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- model-training
-description: fine-tune llm model with ease
-doc: ''
-example: huggingface_auto_trainer.ipynb
-generationDate: 2023-08-21:17-25
-hidden: false
-icon: ''
-labels:
-  author: Zeevr
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.0
-name: huggingface-auto-trainer
-platformVersion: 3.5.0
-spec:
-  filename: huggingface_auto_trainer.py
-  handler: finetune_llm
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.0.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/huggingface_auto_trainer/latest/static/source.html b/functions/development/huggingface_auto_trainer/latest/static/source.html deleted file mode 100644 index 7c445e5d..00000000 --- a/functions/development/huggingface_auto_trainer/latest/static/source.html +++ /dev/null @@ -1,877 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import importlib
-import os
-import shutil
-import tempfile
-import zipfile
-from abc import ABC
-from typing import Dict, List, Tuple, Union
-
-import mlrun
-import numpy as np
-import pandas as pd
-import peft
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from mlrun.artifacts.manager import Artifact, PlotlyArtifact
-from mlrun.datastore import is_store_uri
-from mlrun.frameworks._common import CommonTypes, MLRunInterface
-from mlrun.utils import logger
-from peft import (LoraConfig, PeftModel, get_peft_model,
-                  prepare_model_for_kbit_training)
-from plotly import graph_objects as go
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
-                          TrainerCallback, TrainerControl, TrainerState,
-                          TrainingArguments)
-
-supported_tasks = [
-    "question-answering",
-    "summarization",
-    "table-question-answering",
-    "text2text-generation",
-    "text-classification",
-    "sentiment-analysis",
-    "text-generation",
-    "token-classification",
-    "translation",
-    "translation_xx_to_yy",
-]
-
-
-class ConfigKeys:
-    deepspeed = "deepspeed"
-    quantization = "quantization"
-    lora = "lora"
-    training = "training"
-    tokenizer_pretrained = "tokenizer_pretrained"
-    model_pretrained = "model_pretrained"
-    data_collator = "data_collator"
-
-
-# ----------------------from MLRUN--------------------------------
-class HFTrainerMLRunInterface(MLRunInterface, ABC):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Interface for adding MLRun features for tensorflow keras API.
-    """
-
-    # MLRuns context default name:
-    DEFAULT_CONTEXT_NAME = "mlrun-huggingface"
-
-    # Attributes to replace so the MLRun interface will be fully enabled.
-    _REPLACED_METHODS = [
-        "train",
-        # "evaluate"
-    ]
-
-    @classmethod
-    def add_interface(
-        cls,
-        obj: Trainer,
-        restoration: CommonTypes.MLRunInterfaceRestorationType = None,
-    ):
-        super(HFTrainerMLRunInterface, cls).add_interface(
-            obj=obj, restoration=restoration
-        )
-
-    @classmethod
-    def mlrun_train(cls):
-        def wrapper(self: Trainer, *args, **kwargs):
-            # Restore the evaluation method as `train` will use it:
-            # cls._restore_attribute(obj=self, attribute_name="evaluate")
-
-            # Call the original fit method:
-            result = self.original_train(*args, **kwargs)
-
-            # Replace the evaluation method again:
-            # cls._replace_function(obj=self, function_name="evaluate")
-
-            return result
-
-        return wrapper
-
-
-class MLRunCallback(TrainerCallback):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    Callback for collecting logs during training / evaluation of the `Trainer` API.
-    """
-
-    def __init__(
-        self,
-        context: mlrun.MLClientCtx = None,
-        model_name: str = "model",
-        tag: str = "",
-        labels: Dict[str, str] = None,
-        extra_data: dict = None,
-    ):
-        super().__init__()
-
-        # Store the configurations:
-        self._context = (
-            context
-            if context is not None
-            else mlrun.get_or_create_ctx("./mlrun-huggingface")
-        )
-        self._model_name = model_name
-        self._tag = tag
-        self._labels = labels
-        self._extra_data = extra_data if extra_data is not None else {}
-
-        # Set up the logging mode:
-        self._is_training = False
-        self._steps: List[List[int]] = []
-        self._metric_scores: Dict[str, List[float]] = {}
-        self._artifacts: Dict[str, Artifact] = {}
-
-    def on_epoch_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self._steps.append([])
-
-    def on_epoch_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self.log_metrics()
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float] = None,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        recent_logs = state.log_history[-1].copy()
-
-        recent_logs.pop("epoch")
-        current_step = int(recent_logs.pop("step"))
-        if current_step not in self._steps[-1]:
-            self._steps[-1].append(current_step)
-
-        for metric_name, metric_score in recent_logs.items():
-            if metric_name.startswith("train_"):
-                if metric_name.split("train_")[1] not in self._metric_scores:
-                    self._metric_scores[metric_name] = [metric_score]
-                continue
-            if metric_name not in self._metric_scores:
-                self._metric_scores[metric_name] = []
-            self._metric_scores[metric_name].append(metric_score)
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self._is_training = True
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: PreTrainedModel = None,
-        tokenizer: PreTrainedTokenizer = None,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self.log_metrics()
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        if not state.is_world_process_zero:
-            return
-        self.log_metrics()
-
-        if self._is_training:
-            return
-
-    def log_metrics(self):
-        for metric_name, metric_scores in self._metric_scores.items():
-            self._context.log_result(key=metric_name, value=metric_scores[-1])
-            if len(metric_scores) > 1:
-                self.log_metric_plot(name=metric_name, scores=metric_scores)
-        self._context.commit(completed=False)
-
-    def log_metric_plot(self, name: str, scores: List[float]):
-        # Initialize a plotly figure:
-        metric_figure = go.Figure()
-
-        # Add titles:
-        metric_figure.update_layout(
-            title=name.capitalize().replace("_", " "),
-            xaxis_title="Samples",
-            yaxis_title="Scores",
-        )
-
-        # Draw:
-        metric_figure.add_trace(
-            go.Scatter(x=np.arange(len(scores)), y=scores, mode="lines")
-        )
-
-        # Create the plotly artifact:
-        artifact_name = f"{name}_plot"
-        artifact = PlotlyArtifact(key=artifact_name, figure=metric_figure)
-        self._artifacts[artifact_name] = self._context.log_artifact(artifact)
-
-
-def apply_mlrun(
-    trainer: transformers.Trainer,
-    model_name: str = None,
-    tag: str = "",
-    context: mlrun.MLClientCtx = None,
-    auto_log: bool = True,
-    labels: Dict[str, str] = None,
-    extra_data: dict = None,
-    **kwargs,
-):
-    """
-    This is temporary and will be built in mlrun 1.5.0
-    """
-    # Get parameters defaults:
-    if context is None:
-        context = mlrun.get_or_create_ctx(HFTrainerMLRunInterface.DEFAULT_CONTEXT_NAME)
-
-    HFTrainerMLRunInterface.add_interface(obj=trainer)
-
-    if auto_log:
-        trainer.add_callback(
-            MLRunCallback(
-                context=context,
-                model_name=model_name,
-                tag=tag,
-                labels=labels,
-                extra_data=extra_data,
-            )
-        )
-
-
-# ----------------------end from MLRUN--------------------------------
-
-
-def _print_trainable_parameters(model):
-    """
-    Prints the number of trainable parameters in the model.
-    """
-    trainable_params = 0
-    all_param = 0
-    for _, param in model.named_parameters():
-        all_param += param.numel()
-        if param.requires_grad:
-            trainable_params += param.numel()
-    print(
-        f"trainable params: {trainable_params} || all params: {all_param} || trainable%:"
-        f" {100 * trainable_params / all_param}"
-    )
-
-
-# default configs
-# will be used if user provides "True" with config name as input
-QUANTIZATION_CONFIG = transformers.BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16,
-)
-
-LORA_CONFIG = peft.LoraConfig(
-    r=8,
-    lora_alpha=32,
-    target_modules=["query_key_value"],
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM",
-)
-
-DEEPSPEED_CONFIG = {
-    "train_micro_batch_size_per_gpu": "auto",
-    "fp16": {"enabled": True},
-    "autotuning": {
-        "enabled": True,
-        "arg_mappings": {
-            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
-            "gradient_accumulation_steps ": "--gradient_accumulation_steps",
-        },
-    },
-    "zero_optimization": {
-        "stage": 2,
-    },
-}
-
-
-def _update_config(src: dict, dst: dict):
-    """
-    update configs according to user, this way the user can add/modify values in default configs for e.g.
-
-    goes over all configs and corresponding prefixes, collect all the keys from the given dict that start
-     with the prefix and add them to appropriate config
-
-    :param src: dict of all candidate values to update dict.
-    :param dst: dict containing all configs to update.
-    """
-
-    for config_name, config in dst.items():
-
-        # If given True we use default dict
-        # Can also be False or a config dict given from user, so we check specifically fo True
-        if config is True and config_name == "quantization":
-            config = QUANTIZATION_CONFIG
-
-        if config is True and config_name == "lora":
-            config = LORA_CONFIG
-
-        if config is True and config_name == "deepspeed":
-            config = DEEPSPEED_CONFIG
-
-        # in some cases we can get a boolean value, in that case no need to look for args
-        if isinstance(config, bool):
-            config = None
-
-        elif isinstance(config, dict):
-            for key, val in src.items():
-                if key.startswith(config_name):
-                    config[key.replace(f"{config_name}_", "")] = val
-
-        # update by config name
-        else:
-            for key, val in src.items():
-                if key.startswith(config_name):
-                    setattr(config, key.replace(f"{config_name}_", ""), val)
-
-        dst.update({config_name: config})
-
-
-def _get_class_object(class_path: str) -> type:
-    """
-    given a full class name, this function returns the correct class
-
-    :param class_path: a full class name (ex. 'transformers.AutoModelForCausalLM')
-
-    :return the wanted class object
-    """
-    module_path, class_name = class_path.rsplit(".", 1)
-    module = importlib.import_module(module_path)
-    return getattr(module, class_name)
-
-
-def _set_model_and_tokenizer(
-    model: Union[str, List[str]],
-    tokenizer: Union[str, List[str]],
-    task: str,
-    framework: str,
-    lora_config: dict,
-    quantization_config: dict,
-    use_cuda: bool,
-    tokenizer_pretrained_config,
-    model_pretrained_config,
-    device_map: str,
-):
-    """
-    get the correct model and tokenizer according to given user inputs
-
-    :param model: a tuple containing model name and class, or str with model name or path
-    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
-    :param task: a supported nlp task, used to choose model if not provided
-    :param framework: pt or tf
-    :param lora_config: lora config or None, to load model in appropriate way
-    :param quantization_config: quantization config or None, to load model in appropriate way
-    :param use_cuda: use gpu or not
-    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
-    :param model_pretrained_config: config to load the pretrained model
-    :param device_map: a device map for model training if using number of gpu's
-
-    :returns: model and tokenizer
-    """
-    # if task is not supported and no model was given we can't choose one
-    if task and task not in supported_tasks and not model:
-        logger.error("unsupported task option chosen")
-        raise
-
-    # load model from store
-    if isinstance(model, str) and is_store_uri(model):
-        pass
-        # TODO: load both model and tokenizer and return, need guy's help
-
-    # if it's a tuple them we assume it contains of both name and class
-    if isinstance(model, list):
-        model_name, model_class = model
-        model_class = _get_class_object(model_class)
-
-    # in the case we don't get the model class we need the task in order to choose the correct model
-    else:
-        if task is None:
-            logger.error("task must be chosen in order to determine the correct model")
-            raise Exception(
-                "this function requires either a supported task or a model and model class to be chosen"
-            )
-
-        _, available_classes, task_options = transformers.pipelines.check_task(task)
-
-        if isinstance(model, str):
-            model_name = model
-
-        # if model is not given, we take the default model for the given task
-        else:
-            model_name, _ = transformers.pipelines.get_default_model_and_revision(
-                available_classes, framework, task_options
-            )
-        if not available_classes.get(framework, tuple()):
-            logger.error(
-                "given task's default model is not supported in specified framework"
-            )
-            raise Exception(
-                "this function requires either a supported task or a model and model class to be chosen"
-            )
-
-        model_class = available_classes[framework][0]
-
-    # load the pretrained model
-    if use_cuda:
-        device_map = device_map
-    else:
-        device_map = None
-
-    model = model_class.from_pretrained(
-        model_name,
-        quantization_config=quantization_config,
-        device_map=device_map,
-        **model_pretrained_config,
-    )
-
-    # If quantization config is given we will load a quantized model, if not a regular one
-    if quantization_config:
-        model.gradient_checkpointing_enable()
-        model = peft.prepare_model_for_kbit_training(model)
-
-    # If lora config was given we want to do lora fine tune, we update model here
-    if lora_config:
-        model = peft.get_peft_model(model, lora_config)
-
-    # if not specified we choose the default tokenizer that corresponding to the model
-    if tokenizer is None:
-        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-        return model_name, model, tokenizer
-
-    if isinstance(tokenizer, str):
-        tokenizer_name = tokenizer
-        tokenizer_class = transformers.AutoTokenizer
-
-    # if it's not a str then it's a tuple of both name and class
-    else:
-        tokenizer_name, tokenizer_class = tokenizer
-        tokenizer_class = _get_class_object(tokenizer_class)
-
-    tokenizer = tokenizer_class.from_pretrained(
-        tokenizer_name, **tokenizer_pretrained_config
-    )
-
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return model_name, model, tokenizer
-
-
-def _dataset_loader(dataset: str, is_train: bool = True, **kwargs) -> Dataset:
-    """
-    loads the specific dataset provided by the user
-
-    :param dataset: name or path of dataset to load
-    :param is_train: bool that indicates the purpose of the dataset
-    :param kwargs: other kwargs for loading the dataset
-
-    :returns: loaded dataset
-    """
-    # if split in kwargs then the user decides how to split the dataset
-    if "split" in kwargs:
-        return load_dataset(dataset, **kwargs)
-
-    # if it's a dataset for train we split with train
-    if is_train:
-        return load_dataset(dataset, split="train", **kwargs)
-
-    # if it's eval dataset, then a lot of names are acceptable for the set and we check all of them
-    dataset = load_dataset(dataset, **kwargs)
-    if "test" in dataset:
-        return dataset.get("test")
-    elif "eval" in dataset:
-        return dataset.get("eval")
-    elif "validation" in dataset:
-        return dataset.get("validation")
-
-
-def _prepare_dataset(
-    train_dataset: str,
-    eval_dataset: str,
-    train_load_dataset_kwargs,
-    eval_load_dataset_kwargs,
-    tokenizer,
-    dataset_columns_to_train: Union[str, list],
-) -> (Dataset, Union[Dataset, None]):
-    """
-    Loads the train and eval datasets (if provided) passes them through the tokenizer and
-    returns them ready to use in training
-
-    :param train_dataset: the name or path to the train dataset
-    :param eval_dataset: the name or path to the eval dataset
-    :param dataset_columns_to_train: which columns to pass to the model as inputs
-                                        (need to pass through the tokenizer first)
-    :param train_load_dataset_kwargs: kwargs for dataset loading
-    :param eval_load_dataset_kwargs: kwargs for dataset loading
-    :param tokenizer: the tokenizer to pass the data through
-
-    :returns: tokenized datasets
-    """
-    if not tokenizer.pad_token:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    # we take col name/s in a list for easy generalization
-    if isinstance(dataset_columns_to_train, str):
-        dataset_columns_to_train = [dataset_columns_to_train]
-
-    if isinstance(train_dataset, mlrun.datastore.DataItem):
-        train_dataset = Dataset.from_pandas(train_dataset.as_df())
-        return (
-            train_dataset.map(
-                lambda examples: tokenizer(
-                    *[examples[col] for col in dataset_columns_to_train],
-                    truncation=True,
-                    padding=True,
-                ),
-                batched=True,
-            ),
-            None,
-        )
-
-    # Load datasets
-    # if provided two paths/names we load each separately using designated func
-    if eval_dataset:
-        train_dataset = _dataset_loader(
-            dataset=train_dataset, is_train=True, **train_load_dataset_kwargs
-        )
-        eval_dataset = _dataset_loader(
-            dataset=eval_dataset, is_train=False, **eval_load_dataset_kwargs
-        )
-
-    # if only on path is given then we must check if it contains both dataset or if only one should be used
-    else:
-        dataset = load_dataset(train_dataset, **train_load_dataset_kwargs)
-        if "train" in dataset:
-            train_dataset = dataset.get("train")
-            if "test" in dataset:
-                eval_dataset = dataset.get("test")
-            elif "eval" in dataset:
-                eval_dataset = dataset.get("eval")
-            elif "validation" in dataset:
-                eval_dataset = dataset.get("validation")
-            else:
-                # only train dataset given, tokenize and return it
-                return (
-                    train_dataset.map(
-                        lambda examples: tokenizer(
-                            *[examples[col] for col in dataset_columns_to_train],
-                            truncation=True,
-                            padding=True,
-                        ),
-                        batched=True,
-                    ),
-                    None,
-                )
-        else:
-            logger.error("train dataset is mandatory")
-            raise KeyError("no train dataset found in given dataset")
-
-    # Tokenize the data so the model can understand it
-    tokenized_train_dataset = train_dataset.map(
-        lambda examples: tokenizer(
-            *[examples[col] for col in dataset_columns_to_train],
-            truncation=True,
-            padding=True,
-        ),
-        batched=True,
-    )
-
-    tokenized_eval_dataset = eval_dataset.map(
-        lambda examples: tokenizer(
-            *[examples[col] for col in dataset_columns_to_train],
-            truncation=True,
-            padding=True,
-        ),
-        batched=True,
-    )
-
-    return tokenized_train_dataset, tokenized_eval_dataset
-
-
-def finetune_llm(
-    context: mlrun.MLClientCtx,
-    train_dataset: Union[str, mlrun.datastore.DataItem],
-    eval_dataset: str = None,
-    train_load_dataset_kwargs: dict = {},
-    eval_load_dataset_kwargs: dict = {},
-    dataset_columns_to_train: Union[str, list] = "text",
-    model: Union[str, List[str]] = "huggingface-model",
-    tokenizer: Union[str, List[str]] = None,
-    deepspeed_config: Union[dict, bool] = False,
-    quantization_config: Union[dict, bool] = False,
-    lora_config: Union[dict, bool] = False,
-    training_config: dict = {},
-    model_pretrained_config: dict = {},
-    tokenizer_pretrained_config: dict = {},
-    data_collator_config: dict = {},
-    task: str = "text-generation",
-    use_cuda: bool = True,
-    framework: str = "pt",
-    device_map: str = "auto",
-    **kwargs,
-):
-    """
-    Fine-tunes a Language Model (LLM) on a specific task using the provided dataset.
-     The function takes various configuration parameters to customize the training process
-     and adapt the model to specific tasks using a provided dataset.
-
-    :param context: mlrun context in order to log trained model
-    :param dataset_columns_to_train: which columns to pass to the model as inputs
-    :param eval_load_dataset_kwargs: kwargs for dataset loading
-    :param train_load_dataset_kwargs: kwargs for dataset loading
-    :param framework: pt ot tf
-    :param use_cuda: use gpu or not
-    :param tokenizer_pretrained_config: config to load the pretrained tokenizer
-    :param model_pretrained_config: config to load the pretrained model
-    :param tokenizer: a tuple containing tokenizer name and class, or str with tokenizer name or path
-    :param model: a tuple containing model name and class, or str with model name or path
-    :param train_dataset: The train dataset used for fine-tuning the language model.
-    :param eval_dataset: The eval dataset used for evaluate the language model during training.
-    :param deepspeed_config: Configuration options for DeepSpeed (optional).
-    :param quantization_config: Configuration options for model quantization (optional).
-    :param lora_config: Configuration options for Low-Rank Approximation (LoRA) (optional).
-    :param training_config: Configuration options specific to the fine-tuning training process (optional).
-    :param data_collator_config: Configuration options for data collation during training (optional).
-    :param task: A description of the specific task the model is being fine-tuned for.
-    :param kwargs: Additional keyword arguments.
-    """
-
-    # TODO: match forward.keyword to dataset.keyword - check if relevant in new design
-    # TODO: add warning for label, and add option to modify dataset col names - check if relevant in new design
-
-    # Look for updates to configs given in kwargs
-    configs = {
-        ConfigKeys.deepspeed: deepspeed_config,
-        ConfigKeys.quantization: quantization_config,
-        ConfigKeys.lora: lora_config,
-        ConfigKeys.training: training_config,
-        ConfigKeys.model_pretrained: model_pretrained_config,
-        ConfigKeys.tokenizer_pretrained: tokenizer_pretrained_config,
-        ConfigKeys.data_collator: data_collator_config,
-    }
-    _update_config(dst=configs, src=kwargs)
-
-    # check gpu permission and availability
-    if use_cuda:
-        if torch.cuda.is_available():
-            # Clean gpu cache
-            torch.cuda.empty_cache()
-        else:
-            logger.warning("'use_cuda' is set to True, but no cuda device is available")
-
-    # get model and tokenizer
-    model_name, model, tokenizer = _set_model_and_tokenizer(
-        model=model,
-        tokenizer=tokenizer,
-        task=task,
-        framework=framework,
-        lora_config=configs[ConfigKeys.lora],
-        quantization_config=configs[ConfigKeys.quantization],
-        use_cuda=use_cuda,
-        tokenizer_pretrained_config=tokenizer_pretrained_config,
-        model_pretrained_config=configs[ConfigKeys.model_pretrained],
-        device_map=device_map,
-    )
-
-    # Load datasets
-    tokenized_train, tokenized_eval = _prepare_dataset(
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        train_load_dataset_kwargs=train_load_dataset_kwargs,
-        eval_load_dataset_kwargs=eval_load_dataset_kwargs,
-        tokenizer=tokenizer,
-        dataset_columns_to_train=dataset_columns_to_train,
-    )
-
-    # Initialize the data collator for the trainer to use in order to create batches of data
-    data_collator = transformers.DataCollatorForLanguageModeling(
-        tokenizer=tokenizer, mlm=False, **data_collator_config
-    )
-
-    # Initialize training kwargs from user kwargs:
-    train_kwargs = configs[ConfigKeys.training]
-
-    # If deepspeed config given we add it to training kwargs
-    if configs[ConfigKeys.deepspeed]:
-        train_kwargs["deepspeed"] = configs[ConfigKeys.deepspeed]
-
-    # Take a look at the trainable parameters in the model
-    _print_trainable_parameters(model)
-
-    # Preparing training arguments:
-    training_args = transformers.TrainingArguments(
-        output_dir=tempfile.mkdtemp(),
-        **train_kwargs,
-    )
-
-    trainer = transformers.Trainer(
-        model=model,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_eval,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        args=training_args,
-    )
-
-    apply_mlrun(trainer, model_name=model_name.split("/")[-1])
-    model.config.use_cache = (
-        False  # silence the warnings. Please re-enable for inference!
-    )
-
-    # Apply training with evaluation:
-    context.logger.info(f"training '{model_name}'")
-    trainer.train()
-
-    temp_directory = tempfile.TemporaryDirectory().name
-    trainer.save_model(temp_directory)
-
-    # Zip the model directory:
-    shutil.make_archive(
-        base_name="model",
-        format="zip",
-        root_dir=temp_directory,
-    )
-
-    # Log the model:
-    context.log_model(
-        key="model",
-        db_key=model_name.split("/")[-1],
-        model_file="model.zip",
-        tag="",
-        framework="Hugging Face",
-    )
-
-
-def evaluate(
-    context,
-    model_path,
-    data: pd.DataFrame,
-    model_name: str = None,
-    tokenizer_name: str = None,
-):
-    """
-    Evaluating the model using perplexity, for more information visit:
-    https://huggingface.co/docs/transformers/perplexity
-
-    :param context:     mlrun context
-    :param model_path:  path to the model directory
-    :param data:        the data to evaluate the model
-    :param model_name:  name of base model
-    :param tokenizer_name: name of base tokenizer
-    """
-    # Get the model artifact and file:
-    (
-        model_file,
-        model_artifact,
-        extra_data,
-    ) = mlrun.artifacts.get_model(model_path)
-
-    # Read the name:
-    _model_name = model_artifact.spec.db_key
-
-    # Extract logged model files:
-    model_directory = os.path.join(os.path.dirname(model_file), _model_name)
-    with zipfile.ZipFile(model_file, "r") as zip_file:
-        zip_file.extractall(model_directory)
-
-    # Loading the saved pretrained tokenizer and model:
-    dataset = Dataset.from_pandas(data)
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    pad_token_id = tokenizer.eos_token_id
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, device_map="cuda:0", trust_remote_code=True, load_in_8bit=True
-    )
-    model = PeftModel.from_pretrained(model, model_directory)
-    model.eval()
-    encodings = tokenizer("\n\n".join(dataset["text"][:5]), return_tensors="pt")
-
-    max_length = 1024
-    stride = 512
-    seq_len = encodings.input_ids.size(1)
-
-    nlls = []
-    prev_end_loc = 0
-    for begin_loc in range(0, seq_len, stride):
-        end_loc = min(begin_loc + max_length, seq_len)
-        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
-        input_ids = encodings.input_ids[:, begin_loc:end_loc]
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-
-        with torch.no_grad():
-            outputs = model(input_ids.cuda(), labels=target_ids)
-
-            # loss is calculated using CrossEntropyLoss which averages over valid labels
-            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
-            # to the left by 1.
-            neg_log_likelihood = outputs.loss
-
-        nlls.append(neg_log_likelihood)
-
-        prev_end_loc = end_loc
-        if end_loc == seq_len:
-            break
-
-    ppl = torch.exp(torch.stack(nlls).mean()).item()
-    context.log_result("perplexity", ppl)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/0.0.1/src/function.yaml b/functions/development/ingest/0.0.1/src/function.yaml deleted file mode 100644 index a05ca669..00000000 --- a/functions/development/ingest/0.0.1/src/function.yaml +++ /dev/null @@ -1,87 +0,0 @@ -kind: job -metadata: - name: ingest - tag: '' - hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29
uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG11c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmd
ldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo= - commands: [] - code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py - origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py - entry_points: - ingest: - name: ingest - doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\ - \ reads from the source, run the graph transformations, infers metadata and\ - \ stats\nand writes the results to the default of specified targets\n\nwhen\ - \ targets are not specified data is stored in the configured default targets\n\ - (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\ - \n stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\ - \ stocks = pd.read_csv(\"stocks.csv\")\n df = ingest(stocks_set, stocks,\ - \ infer_options=fstore.InferOptions.default())\n\n # for running as remote\ - \ job\n config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \ - \ df = ingest(stocks_set, stocks, run_config=config)\n\n # specify source\ - \ and targets\n source = CSVSource(\"mycsv\", path=\"measurements.csv\"\ - )\n targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n ingest(measurements,\ - \ source, targets)" - parameters: - - name: context - type: MLClientCtx - 
doc: MLRun context - default: '' - - name: featureset - type: str - doc: feature set object or featureset.uri. (uri must be of a feature set that - is in the DB, call `.save()` if it's not) - default: '' - - name: source - type: str - doc: source dataframe or file path - default: '' - - name: targets - type: List[Union[str, Dict]] - doc: optional list of data target objects - default: null - - name: namespace - doc: namespace or module containing graph classes - default: null - - name: infer_options - doc: schema and stats infer options - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: spark_context - doc: 'local spark session for spark ingestion, example for creating the spark - context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service - name' - default: null - - name: overwrite - doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled - ingest - deletes the targets that are about to be ingested. False for scheduled - ingest - does not delete the target)' - default: null - outputs: - - default: '' - lineno: 8 - description: Feature Store ingest function that runs the transformation graph on - the source of the featureset. 
- default_handler: ingest - disable_auto_mount: false - env: [] - priority_class_name: '' - affinity: null -verbose: false diff --git a/functions/development/ingest/0.0.1/src/ingest.ipynb b/functions/development/ingest/0.0.1/src/ingest.ipynb deleted file mode 100644 index 7da398b4..00000000 --- a/functions/development/ingest/0.0.1/src/ingest.ipynb +++ /dev/null @@ -1,762 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feature Store Ingest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read local DataFrame, file, URL, or source into the feature store\n", - "Ingest reads from the source, run the graph transformations, infers metadata and stats\n", - "and writes the results to the default of specified targets." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Project" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB\n" - ] - } - ], - "source": [ - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('ingest', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " 
pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\"),\n", - " ],\n", - " \"ticker\": [\"GOOG\", \"MSFT\", \"MSFT\", \"MSFT\", \"GOOG\", \"AAPL\", \"GOOG\", \"MSFT\"],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],\n", - " }\n", - ")\n", - "\n", - "# move date:\n", - "max_date = quotes[\"time\"].max()\n", - "now_date = datetime.datetime.now()\n", - "delta = now_date - max_date\n", - "quotes[\"time\"] = quotes[\"time\"] + delta" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 13:52:16.905388 GOOG 720.50 720.93\n", - "1 2022-01-31 13:52:16.905388 MSFT 51.95 51.96\n", - "2 2022-01-31 13:52:16.912388 MSFT 51.97 51.98\n", - "3 2022-01-31 13:52:16.923388 MSFT 51.99 52.00\n", - "4 2022-01-31 13:52:16.930388 GOOG 720.50 720.93\n", - "5 2022-01-31 13:52:16.931388 AAPL 97.99 98.01\n", - "6 2022-01-31 13:52:16.954388 GOOG 720.50 720.88\n", - "7 2022-01-31 13:52:16.957388 MSFT 52.01 52.03" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build Advanced Feature Set - With Feature Engineering Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a custom pipeline step (python class)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Build and show the transformatiom pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "map.MyMap\n", - "\n", - "map.MyMap\n", - "\n", - "\n", - "\n", - "_start->map.MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "map.MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - 
"storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - "filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])\n", - "\n", - "quotes_set.graph.to(\"map.MyMap\", multiplier=3).to(\n", - " \"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\"\n", - ").to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\").to(\n", - " FeaturesetValidator()\n", - ")\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(\n", - " validator=MinMaxValidator(min=52, severity=\"info\")\n", - ")\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Saving the feature set in the feature store " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set.save()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating the data source of the feature set to apply the ingest on:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "data_uri = 'quotes.csv'\n", - "quotes.to_csv(data_uri, index=False)\n", - "source = CSVSource('quotes', data_uri).to_dict()\n", - "source" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import ingest function" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "ingest_fn = mlrun.import_function(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the function locally" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080\n", - "> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes\n", - "> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}\n", - "> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:\n", - "> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:20,045 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "ingest_run = ingest_fn.run(\n", - " handler=\"ingest\",\n", - " params={\n", - " \"featureset\": quotes_set.uri,\n", - " \"source\": source,\n", - " },\n", - " local=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View of the targets' state after run" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'created'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fstore.get_feature_set(ingest_run.outputs['featureset']).status.state" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/ingest/0.0.1/src/ingest.py b/functions/development/ingest/0.0.1/src/ingest.py deleted file mode 100644 index cf3285d8..00000000 --- a/functions/development/ingest/0.0.1/src/ingest.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Union, List, Dict - -import mlrun.feature_store as fs -from mlrun.execution import MLClientCtx -from mlrun.data_types import 
InferOptions - - -def ingest( - context: MLClientCtx, - featureset: str, - source: str, - targets: List[Union[str, Dict]] = None, - namespace=None, - infer_options=None, - run_config: Union[str, Dict] = None, - spark_context=None, - overwrite=None, -): - """Read local DataFrame, file, URL, or source into the feature store - Ingest reads from the source, run the graph transformations, infers metadata and stats - and writes the results to the default of specified targets - - when targets are not specified data is stored in the configured default targets - (will usually be NoSQL for real-time and Parquet for offline). - - example:: - - stocks_set = FeatureSet("stocks", entities=[Entity("ticker")]) - stocks = pd.read_csv("stocks.csv") - df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default()) - - # for running as remote job - config = RunConfig(image='mlrun/mlrun').apply(mount_v3io()) - df = ingest(stocks_set, stocks, run_config=config) - - # specify source and targets - source = CSVSource("mycsv", path="measurements.csv") - targets = [CSVTarget("mycsv", path="./mycsv.csv")] - ingest(measurements, source, targets) - - :param context: MLRun context - :param featureset: feature set object or featureset.uri. 
(uri must be of a feature set that is in the DB, - call `.save()` if it's not) - :param source: source dataframe or file path - :param targets: optional list of data target objects - :param namespace: namespace or module containing graph classes - :param infer_options: schema and stats infer options - :param run_config: function and/or run configuration for remote jobs, - see :py:class:`~mlrun.feature_store.RunConfig` - :param spark_context: local spark session for spark ingestion, example for creating the spark context: - `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service name - :param overwrite: delete the targets' data prior to ingestion - (default: True for non-scheduled ingest - deletes the targets that are about to be ingested. - False for scheduled ingest - does not delete the target) - - """ - # Setting infer_options to default: - context._parameters["infer_options"] = infer_options or InferOptions.default() - - context.logger.info(f"Calling ingestion task with: {featureset}") - - # ingest called with mlrun_context, feature_set, source and targets passed with context - # This params here for documentation purposes only - fs.ingest( - mlrun_context=context, - namespace=namespace, - spark_context=spark_context, - ) - context.log_result("featureset", featureset) diff --git a/functions/development/ingest/0.0.1/src/item.yaml b/functions/development/ingest/0.0.1/src/item.yaml deleted file mode 100644 index 8d80eef4..00000000 --- a/functions/development/ingest/0.0.1/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: - - data-preparation - - data-analysis - - feature-store -description: Feature Store ingest function that runs the transformation graph on the source of the featureset. 
-doc: '' -example: ingest.ipynb -generationDate: 2021-11-13:00-15 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: ingest -platformVersion: '' -spec: - filename: ingest.py - handler: ingest - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 0.0.1 \ No newline at end of file diff --git a/functions/development/ingest/0.0.1/src/test_ingest.py b/functions/development/ingest/0.0.1/src/test_ingest.py deleted file mode 100644 index 062898dc..00000000 --- a/functions/development/ingest/0.0.1/src/test_ingest.py +++ /dev/null @@ -1,136 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.sources import CSVSource -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -import pandas as pd - - -def _set_environment(): - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.new_project("ingest-test") - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. 
- """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts' directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame): - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - # move date: - max_date = quotes["time"].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - quotes["time"] = quotes["time"] + delta - - return quotes - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("test_ingest.MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") 
- quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions - quotes_set.set_targets() - return quotes_set - - -def test_ingest(): - artifact_path, project = _set_environment() - ingest_fn = mlrun.import_function("function.yaml") - quotes = create_dataframes() - - quotes_set = _create_feature_set() - quotes_set.save() - - data_uri = os.path.join(artifact_path, "quotes.csv") - quotes.to_csv(data_uri, index=False) - source = CSVSource("quotes", data_uri).to_dict() - - ingest_run = None - try: - ingest_run = ingest_fn.run( - handler="ingest", - params={ - "featureset": quotes_set.uri, - "source": source, - }, - local=True, - ) - - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert ( - fstore.get_feature_set(ingest_run.outputs["featureset"]).status.state - == "created" - ), "Targets not created successfully" - _cleanup_environment(artifact_path) diff --git a/functions/development/ingest/0.0.1/static/documentation.html b/functions/development/ingest/0.0.1/static/documentation.html deleted file mode 100644 index c7861d66..00000000 --- a/functions/development/ingest/0.0.1/static/documentation.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - -ingest package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

ingest package

-
-

Submodules

-
-
-

ingest.ingest module

-
-
-ingest.ingest.ingest(context: mlrun.execution.MLClientCtx, featureset: str, source: str, targets: Optional[List[Union[str, Dict]]] = None, namespace=None, infer_options=None, run_config: Optional[Union[str, Dict]] = None, spark_context=None, overwrite=None)[source]
-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets

-

when targets are not specified data is stored in the configured default targets -(will usually be NoSQL for real-time and Parquet for offline).

-

example:

-
stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-stocks = pd.read_csv("stocks.csv")
-df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-# for running as remote job
-config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-df = ingest(stocks_set, stocks, run_config=config)
-
-# specify source and targets
-source = CSVSource("mycsv", path="measurements.csv")
-targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-ingest(measurements, source, targets)
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • featureset – feature set object or featureset.uri. (uri must be of a feature set that is in the DB, -call .save() if it’s not)

  • -
  • source – source dataframe or file path

  • -
  • targets – optional list of data target objects

  • -
  • namespace – namespace or module containing graph classes

  • -
  • infer_options – schema and stats infer options

  • -
  • run_config – function and/or run configuration for remote jobs, -see RunConfig

  • -
  • spark_context – local spark session for spark ingestion, example for creating the spark context: -spark = SparkSession.builder.appName(“Spark function”).getOrCreate() -For remote spark ingestion, this should contain the remote spark service name

  • -
  • overwrite

    delete the targets’ data prior to ingestion -(default: True for non-scheduled ingest - deletes the targets that are about to be ingested.

    -
    -

    False for scheduled ingest - does not delete the target)

    -
    -

  • -
-
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/ingest/0.0.1/static/example.html b/functions/development/ingest/0.0.1/static/example.html deleted file mode 100644 index 1250bc58..00000000 --- a/functions/development/ingest/0.0.1/static/example.html +++ /dev/null @@ -1,606 +0,0 @@ - - - - - - - -Feature Store Ingest - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Feature Store Ingest

-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets.

-
-

Creating Project

-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from  mlrun.datastore.sources import CSVSource
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-
-
-
-
-
-
-
# Initialize the MLRun project object
-project = mlrun.get_or_create_project('ingest', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB
-
-
-
-
-
-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075"),
-        ],
-        "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
-        "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-        "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
-    }
-)
-
-# move date:
-max_date = quotes["time"].max()
-now_date = datetime.datetime.now()
-delta = now_date - max_date
-quotes["time"] = quotes["time"] + delta
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
-
-
-
-
-

Build Advanced Feature Set - With Feature Engineering Pipeline

-

Define a custom pipeline step (python class)

-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-

Build and show the transformatiom pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-quotes_set.graph.to("map.MyMap", multiplier=3).to(
-    "storey.Extend", _fn="({'extra': event['bid'] * 77})"
-).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to(
-    FeaturesetValidator()
-)
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(
-    validator=MinMaxValidator(min=52, severity="info")
-)
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/ingest_example_12_0.svg
-
-

Saving the feature set in the feature store

-
-
-
quotes_set.save()
-
-
-
-
-

Creating the data source of the feature set to apply the ingest on:

-
-
-
data_uri = 'quotes.csv'
-quotes.to_csv(data_uri, index=False)
-source = CSVSource('quotes', data_uri).to_dict()
-source
-
-
-
-
-
-
-

Import ingest function

-
-
-
ingest_fn = mlrun.import_function("function.yaml")
-
-
-
-
-
-
-

Running the function locally

-
-
-
ingest_run = ingest_fn.run(
-    handler="ingest",
-    params={
-        "featureset": quotes_set.uri,
-        "source": source,
-    },
-    local=True,
-)
-
-
-
-
-
> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080
-> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes
-> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}
-> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:
-> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 13:52:20,045 [info] run executed, status=completed
-
-
-
-
-
-
-

View of the targets’ state after run

-
-
-
fstore.get_feature_set(ingest_run.outputs['featureset']).status.state
-
-
-
-
-
'created'
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/ingest/0.0.1/static/function.html b/functions/development/ingest/0.0.1/static/function.html deleted file mode 100644 index d58ef431..00000000 --- a/functions/development/ingest/0.0.1/static/function.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: ingest
-  tag: ''
-  hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG1
1c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmdldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5
hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo=
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py
-    origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py
-  entry_points:
-    ingest:
-      name: ingest
-      doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\
-        \ reads from the source, run the graph transformations, infers  metadata and\
-        \ stats\nand writes the results to the default of specified targets\n\nwhen\
-        \ targets are not specified data is stored in the configured default targets\n\
-        (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\
-        \n    stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\
-        \    stocks = pd.read_csv(\"stocks.csv\")\n    df = ingest(stocks_set, stocks,\
-        \ infer_options=fstore.InferOptions.default())\n\n    # for running as remote\
-        \ job\n    config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \
-        \   df = ingest(stocks_set, stocks, run_config=config)\n\n    # specify source\
-        \ and targets\n    source = CSVSource(\"mycsv\", path=\"measurements.csv\"\
-        )\n    targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n    ingest(measurements,\
-        \ source, targets)"
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: featureset
-        type: str
-        doc: feature set object or featureset.uri. (uri must be of a feature set that
-          is in the DB, call `.save()` if it's not)
-        default: ''
-      - name: source
-        type: str
-        doc: source dataframe or file path
-        default: ''
-      - name: targets
-        type: List[Union[str, Dict]]
-        doc: optional list of data target objects
-        default: null
-      - name: namespace
-        doc: namespace or module containing graph classes
-        default: null
-      - name: infer_options
-        doc: schema and stats infer options
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: spark_context
-        doc: 'local spark session for spark ingestion, example for creating the spark
-          context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-          For remote spark ingestion, this should contain the remote spark service
-          name'
-        default: null
-      - name: overwrite
-        doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled
-          ingest - deletes the targets that are about to be ingested. False for scheduled
-          ingest - does not delete the target)'
-        default: null
-      outputs:
-      - default: ''
-      lineno: 8
-  description: Feature Store ingest function that runs the transformation graph on
-    the source of the featureset.
-  default_handler: ingest
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/0.0.1/static/item.html b/functions/development/ingest/0.0.1/static/item.html deleted file mode 100644 index e6bab3c8..00000000 --- a/functions/development/ingest/0.0.1/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-description: Feature Store ingest function that runs the transformation graph on the source of the featureset.
-doc: ''
-example: ingest.ipynb
-generationDate: 2021-11-13:00-15
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: ingest
-platformVersion: ''
-spec:
-  filename: ingest.py
-  handler: ingest
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.0.1
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/0.0.1/static/source.html b/functions/development/ingest/0.0.1/static/source.html deleted file mode 100644 index 3ca7f986..00000000 --- a/functions/development/ingest/0.0.1/static/source.html +++ /dev/null @@ -1,92 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun.feature_store as fs
-from mlrun.execution import MLClientCtx
-from mlrun.data_types import InferOptions
-
-
-def ingest(
-    context: MLClientCtx,
-    featureset: str,
-    source: str,
-    targets: List[Union[str, Dict]] = None,
-    namespace=None,
-    infer_options=None,
-    run_config: Union[str, Dict] = None,
-    spark_context=None,
-    overwrite=None,
-):
-    """Read local DataFrame, file, URL, or source into the feature store
-    Ingest reads from the source, run the graph transformations, infers  metadata and stats
-    and writes the results to the default of specified targets
-
-    when targets are not specified data is stored in the configured default targets
-    (will usually be NoSQL for real-time and Parquet for offline).
-
-    example::
-
-        stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-        stocks = pd.read_csv("stocks.csv")
-        df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-        # for running as remote job
-        config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-        df = ingest(stocks_set, stocks, run_config=config)
-
-        # specify source and targets
-        source = CSVSource("mycsv", path="measurements.csv")
-        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-        ingest(measurements, source, targets)
-
-    :param context:       MLRun context
-    :param featureset:    feature set object or featureset.uri. (uri must be of a feature set that is in the DB,
-                          call `.save()` if it's not)
-    :param source:        source dataframe or file path
-    :param targets:       optional list of data target objects
-    :param namespace:     namespace or module containing graph classes
-    :param infer_options: schema and stats infer options
-    :param run_config:    function and/or run configuration for remote jobs,
-                          see :py:class:`~mlrun.feature_store.RunConfig`
-    :param spark_context: local spark session for spark ingestion, example for creating the spark context:
-                          `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-                          For remote spark ingestion, this should contain the remote spark service name
-    :param overwrite:     delete the targets' data prior to ingestion
-                          (default: True for non-scheduled ingest - deletes the targets that are about to be ingested.
-                                    False for scheduled ingest - does not delete the target)
-
-    """
-    # Setting infer_options to default:
-    context._parameters["infer_options"] = infer_options or InferOptions.default()
-
-    context.logger.info(f"Calling ingestion task with: {featureset}")
-
-    # ingest called with mlrun_context, feature_set, source and targets passed with context
-    # This params here for documentation purposes only
-    fs.ingest(
-        mlrun_context=context,
-        namespace=namespace,
-        spark_context=spark_context,
-    )
-    context.log_result("featureset", featureset)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/0.9.0/src/function.yaml b/functions/development/ingest/0.9.0/src/function.yaml deleted file mode 100644 index a05ca669..00000000 --- a/functions/development/ingest/0.9.0/src/function.yaml +++ /dev/null @@ -1,87 +0,0 @@ -kind: job -metadata: - name: ingest - tag: '' - hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29
uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG11c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmd
ldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo= - commands: [] - code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py - origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py - entry_points: - ingest: - name: ingest - doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\ - \ reads from the source, run the graph transformations, infers metadata and\ - \ stats\nand writes the results to the default of specified targets\n\nwhen\ - \ targets are not specified data is stored in the configured default targets\n\ - (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\ - \n stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\ - \ stocks = pd.read_csv(\"stocks.csv\")\n df = ingest(stocks_set, stocks,\ - \ infer_options=fstore.InferOptions.default())\n\n # for running as remote\ - \ job\n config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \ - \ df = ingest(stocks_set, stocks, run_config=config)\n\n # specify source\ - \ and targets\n source = CSVSource(\"mycsv\", path=\"measurements.csv\"\ - )\n targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n ingest(measurements,\ - \ source, targets)" - parameters: - - name: context - type: MLClientCtx - 
doc: MLRun context - default: '' - - name: featureset - type: str - doc: feature set object or featureset.uri. (uri must be of a feature set that - is in the DB, call `.save()` if it's not) - default: '' - - name: source - type: str - doc: source dataframe or file path - default: '' - - name: targets - type: List[Union[str, Dict]] - doc: optional list of data target objects - default: null - - name: namespace - doc: namespace or module containing graph classes - default: null - - name: infer_options - doc: schema and stats infer options - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: spark_context - doc: 'local spark session for spark ingestion, example for creating the spark - context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service - name' - default: null - - name: overwrite - doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled - ingest - deletes the targets that are about to be ingested. False for scheduled - ingest - does not delete the target)' - default: null - outputs: - - default: '' - lineno: 8 - description: Feature Store ingest function that runs the transformation graph on - the source of the featureset. 
- default_handler: ingest - disable_auto_mount: false - env: [] - priority_class_name: '' - affinity: null -verbose: false diff --git a/functions/development/ingest/0.9.0/src/ingest.ipynb b/functions/development/ingest/0.9.0/src/ingest.ipynb deleted file mode 100644 index 7da398b4..00000000 --- a/functions/development/ingest/0.9.0/src/ingest.ipynb +++ /dev/null @@ -1,762 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feature Store Ingest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read local DataFrame, file, URL, or source into the feature store\n", - "Ingest reads from the source, run the graph transformations, infers metadata and stats\n", - "and writes the results to the default of specified targets." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Project" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB\n" - ] - } - ], - "source": [ - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('ingest', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " 
pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\"),\n", - " ],\n", - " \"ticker\": [\"GOOG\", \"MSFT\", \"MSFT\", \"MSFT\", \"GOOG\", \"AAPL\", \"GOOG\", \"MSFT\"],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],\n", - " }\n", - ")\n", - "\n", - "# move date:\n", - "max_date = quotes[\"time\"].max()\n", - "now_date = datetime.datetime.now()\n", - "delta = now_date - max_date\n", - "quotes[\"time\"] = quotes[\"time\"] + delta" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 13:52:16.905388 GOOG 720.50 720.93\n", - "1 2022-01-31 13:52:16.905388 MSFT 51.95 51.96\n", - "2 2022-01-31 13:52:16.912388 MSFT 51.97 51.98\n", - "3 2022-01-31 13:52:16.923388 MSFT 51.99 52.00\n", - "4 2022-01-31 13:52:16.930388 GOOG 720.50 720.93\n", - "5 2022-01-31 13:52:16.931388 AAPL 97.99 98.01\n", - "6 2022-01-31 13:52:16.954388 GOOG 720.50 720.88\n", - "7 2022-01-31 13:52:16.957388 MSFT 52.01 52.03" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build Advanced Feature Set - With Feature Engineering Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a custom pipeline step (python class)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Build and show the transformatiom pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "map.MyMap\n", - "\n", - "map.MyMap\n", - "\n", - "\n", - "\n", - "_start->map.MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "map.MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - 
"storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - "filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])\n", - "\n", - "quotes_set.graph.to(\"map.MyMap\", multiplier=3).to(\n", - " \"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\"\n", - ").to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\").to(\n", - " FeaturesetValidator()\n", - ")\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(\n", - " validator=MinMaxValidator(min=52, severity=\"info\")\n", - ")\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Saving the feature set in the feature store " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set.save()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating the data source of the feature set to apply the ingest on:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "data_uri = 'quotes.csv'\n", - "quotes.to_csv(data_uri, index=False)\n", - "source = CSVSource('quotes', data_uri).to_dict()\n", - "source" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import ingest function" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "ingest_fn = mlrun.import_function(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the function locally" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080\n", - "> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes\n", - "> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}\n", - "> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:\n", - "> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:20,045 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "ingest_run = ingest_fn.run(\n", - " handler=\"ingest\",\n", - " params={\n", - " \"featureset\": quotes_set.uri,\n", - " \"source\": source,\n", - " },\n", - " local=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View of the targets' state after run" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'created'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fstore.get_feature_set(ingest_run.outputs['featureset']).status.state" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/ingest/0.9.0/src/ingest.py b/functions/development/ingest/0.9.0/src/ingest.py deleted file mode 100644 index cf3285d8..00000000 --- a/functions/development/ingest/0.9.0/src/ingest.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Union, List, Dict - -import mlrun.feature_store as fs -from mlrun.execution import MLClientCtx -from mlrun.data_types import 
InferOptions - - -def ingest( - context: MLClientCtx, - featureset: str, - source: str, - targets: List[Union[str, Dict]] = None, - namespace=None, - infer_options=None, - run_config: Union[str, Dict] = None, - spark_context=None, - overwrite=None, -): - """Read local DataFrame, file, URL, or source into the feature store - Ingest reads from the source, run the graph transformations, infers metadata and stats - and writes the results to the default of specified targets - - when targets are not specified data is stored in the configured default targets - (will usually be NoSQL for real-time and Parquet for offline). - - example:: - - stocks_set = FeatureSet("stocks", entities=[Entity("ticker")]) - stocks = pd.read_csv("stocks.csv") - df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default()) - - # for running as remote job - config = RunConfig(image='mlrun/mlrun').apply(mount_v3io()) - df = ingest(stocks_set, stocks, run_config=config) - - # specify source and targets - source = CSVSource("mycsv", path="measurements.csv") - targets = [CSVTarget("mycsv", path="./mycsv.csv")] - ingest(measurements, source, targets) - - :param context: MLRun context - :param featureset: feature set object or featureset.uri. 
(uri must be of a feature set that is in the DB, - call `.save()` if it's not) - :param source: source dataframe or file path - :param targets: optional list of data target objects - :param namespace: namespace or module containing graph classes - :param infer_options: schema and stats infer options - :param run_config: function and/or run configuration for remote jobs, - see :py:class:`~mlrun.feature_store.RunConfig` - :param spark_context: local spark session for spark ingestion, example for creating the spark context: - `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service name - :param overwrite: delete the targets' data prior to ingestion - (default: True for non-scheduled ingest - deletes the targets that are about to be ingested. - False for scheduled ingest - does not delete the target) - - """ - # Setting infer_options to default: - context._parameters["infer_options"] = infer_options or InferOptions.default() - - context.logger.info(f"Calling ingestion task with: {featureset}") - - # ingest called with mlrun_context, feature_set, source and targets passed with context - # This params here for documentation purposes only - fs.ingest( - mlrun_context=context, - namespace=namespace, - spark_context=spark_context, - ) - context.log_result("featureset", featureset) diff --git a/functions/development/ingest/0.9.0/src/item.yaml b/functions/development/ingest/0.9.0/src/item.yaml deleted file mode 100644 index 33036771..00000000 --- a/functions/development/ingest/0.9.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: - - data-preparation - - data-analysis - - feature-store -description: Feature Store ingest function that runs the transformation graph on the source of the featureset. 
-doc: '' -example: ingest.ipynb -generationDate: 2021-11-13:00-15 -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: ingest -platformVersion: '' -spec: - filename: ingest.py - handler: ingest - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 0.9.0 \ No newline at end of file diff --git a/functions/development/ingest/0.9.0/src/test_ingest.py b/functions/development/ingest/0.9.0/src/test_ingest.py deleted file mode 100644 index 062898dc..00000000 --- a/functions/development/ingest/0.9.0/src/test_ingest.py +++ /dev/null @@ -1,136 +0,0 @@ -import os -import tempfile -import shutil -import datetime - -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.sources import CSVSource -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -import pandas as pd - - -def _set_environment(): - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.new_project("ingest-test") - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. 
- """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts' directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame): - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - # move date: - max_date = quotes["time"].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - quotes["time"] = quotes["time"] + delta - - return quotes - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return event - - -def _create_feature_set(): - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("test_ingest.MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") 
- quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions - quotes_set.set_targets() - return quotes_set - - -def test_ingest(): - artifact_path, project = _set_environment() - ingest_fn = mlrun.import_function("function.yaml") - quotes = create_dataframes() - - quotes_set = _create_feature_set() - quotes_set.save() - - data_uri = os.path.join(artifact_path, "quotes.csv") - quotes.to_csv(data_uri, index=False) - source = CSVSource("quotes", data_uri).to_dict() - - ingest_run = None - try: - ingest_run = ingest_fn.run( - handler="ingest", - params={ - "featureset": quotes_set.uri, - "source": source, - }, - local=True, - ) - - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert ( - fstore.get_feature_set(ingest_run.outputs["featureset"]).status.state - == "created" - ), "Targets not created successfully" - _cleanup_environment(artifact_path) diff --git a/functions/development/ingest/0.9.0/static/documentation.html b/functions/development/ingest/0.9.0/static/documentation.html deleted file mode 100644 index c7861d66..00000000 --- a/functions/development/ingest/0.9.0/static/documentation.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - -ingest package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

ingest package

-
-

Submodules

-
-
-

ingest.ingest module

-
-
-ingest.ingest.ingest(context: mlrun.execution.MLClientCtx, featureset: str, source: str, targets: Optional[List[Union[str, Dict]]] = None, namespace=None, infer_options=None, run_config: Optional[Union[str, Dict]] = None, spark_context=None, overwrite=None)[source]
-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets

-

when targets are not specified data is stored in the configured default targets -(will usually be NoSQL for real-time and Parquet for offline).

-

example:

-
stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-stocks = pd.read_csv("stocks.csv")
-df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-# for running as remote job
-config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-df = ingest(stocks_set, stocks, run_config=config)
-
-# specify source and targets
-source = CSVSource("mycsv", path="measurements.csv")
-targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-ingest(measurements, source, targets)
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • featureset – feature set object or featureset.uri. (uri must be of a feature set that is in the DB, -call .save() if it’s not)

  • -
  • source – source dataframe or file path

  • -
  • targets – optional list of data target objects

  • -
  • namespace – namespace or module containing graph classes

  • -
  • infer_options – schema and stats infer options

  • -
  • run_config – function and/or run configuration for remote jobs, -see RunConfig

  • -
  • spark_context – local spark session for spark ingestion, example for creating the spark context: -spark = SparkSession.builder.appName(“Spark function”).getOrCreate() -For remote spark ingestion, this should contain the remote spark service name

  • -
  • overwrite

    delete the targets’ data prior to ingestion -(default: True for non-scheduled ingest - deletes the targets that are about to be ingested.

    -
    -

    False for scheduled ingest - does not delete the target)

    -
    -

  • -
-
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/ingest/0.9.0/static/example.html b/functions/development/ingest/0.9.0/static/example.html deleted file mode 100644 index 1250bc58..00000000 --- a/functions/development/ingest/0.9.0/static/example.html +++ /dev/null @@ -1,606 +0,0 @@ - - - - - - - -Feature Store Ingest - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Feature Store Ingest

-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets.

-
-

Creating Project

-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from  mlrun.datastore.sources import CSVSource
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-
-
-
-
-
-
-
# Initialize the MLRun project object
-project = mlrun.get_or_create_project('ingest', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB
-
-
-
-
-
-
-

Create Sample Data For Demo

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075"),
-        ],
-        "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
-        "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-        "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
-    }
-)
-
-# move date:
-max_date = quotes["time"].max()
-now_date = datetime.datetime.now()
-delta = now_date - max_date
-quotes["time"] = quotes["time"] + delta
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
-
-
-
-
-

Build Advanced Feature Set - With Feature Engineering Pipeline

-

Define a custom pipeline step (python class)

-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-

Build and show the transformatiom pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-quotes_set.graph.to("map.MyMap", multiplier=3).to(
-    "storey.Extend", _fn="({'extra': event['bid'] * 77})"
-).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to(
-    FeaturesetValidator()
-)
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(
-    validator=MinMaxValidator(min=52, severity="info")
-)
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/ingest_example_12_0.svg
-
-

Saving the feature set in the feature store

-
-
-
quotes_set.save()
-
-
-
-
-

Creating the data source of the feature set to apply the ingest on:

-
-
-
data_uri = 'quotes.csv'
-quotes.to_csv(data_uri, index=False)
-source = CSVSource('quotes', data_uri).to_dict()
-source
-
-
-
-
-
-
-

Import ingest function

-
-
-
ingest_fn = mlrun.import_function("function.yaml")
-
-
-
-
-
-
-

Running the function locally

-
-
-
ingest_run = ingest_fn.run(
-    handler="ingest",
-    params={
-        "featureset": quotes_set.uri,
-        "source": source,
-    },
-    local=True,
-)
-
-
-
-
-
> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080
-> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes
-> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}
-> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:
-> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 13:52:20,045 [info] run executed, status=completed
-
-
-
-
-
-
-

View of the targets’ state after run

-
-
-
fstore.get_feature_set(ingest_run.outputs['featureset']).status.state
-
-
-
-
-
'created'
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/ingest/0.9.0/static/function.html b/functions/development/ingest/0.9.0/static/function.html deleted file mode 100644 index d58ef431..00000000 --- a/functions/development/ingest/0.9.0/static/function.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: ingest
-  tag: ''
-  hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG1
1c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmdldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5
hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo=
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py
-    origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py
-  entry_points:
-    ingest:
-      name: ingest
-      doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\
-        \ reads from the source, run the graph transformations, infers  metadata and\
-        \ stats\nand writes the results to the default of specified targets\n\nwhen\
-        \ targets are not specified data is stored in the configured default targets\n\
-        (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\
-        \n    stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\
-        \    stocks = pd.read_csv(\"stocks.csv\")\n    df = ingest(stocks_set, stocks,\
-        \ infer_options=fstore.InferOptions.default())\n\n    # for running as remote\
-        \ job\n    config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \
-        \   df = ingest(stocks_set, stocks, run_config=config)\n\n    # specify source\
-        \ and targets\n    source = CSVSource(\"mycsv\", path=\"measurements.csv\"\
-        )\n    targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n    ingest(measurements,\
-        \ source, targets)"
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: featureset
-        type: str
-        doc: feature set object or featureset.uri. (uri must be of a feature set that
-          is in the DB, call `.save()` if it's not)
-        default: ''
-      - name: source
-        type: str
-        doc: source dataframe or file path
-        default: ''
-      - name: targets
-        type: List[Union[str, Dict]]
-        doc: optional list of data target objects
-        default: null
-      - name: namespace
-        doc: namespace or module containing graph classes
-        default: null
-      - name: infer_options
-        doc: schema and stats infer options
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: spark_context
-        doc: 'local spark session for spark ingestion, example for creating the spark
-          context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-          For remote spark ingestion, this should contain the remote spark service
-          name'
-        default: null
-      - name: overwrite
-        doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled
-          ingest - deletes the targets that are about to be ingested. False for scheduled
-          ingest - does not delete the target)'
-        default: null
-      outputs:
-      - default: ''
-      lineno: 8
-  description: Feature Store ingest function that runs the transformation graph on
-    the source of the featureset.
-  default_handler: ingest
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/0.9.0/static/item.html b/functions/development/ingest/0.9.0/static/item.html deleted file mode 100644 index 0a0e663a..00000000 --- a/functions/development/ingest/0.9.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-description: Feature Store ingest function that runs the transformation graph on the source of the featureset.
-doc: ''
-example: ingest.ipynb
-generationDate: 2021-11-13:00-15
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: ingest
-platformVersion: ''
-spec:
-  filename: ingest.py
-  handler: ingest
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.0
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/0.9.0/static/source.html b/functions/development/ingest/0.9.0/static/source.html deleted file mode 100644 index 3ca7f986..00000000 --- a/functions/development/ingest/0.9.0/static/source.html +++ /dev/null @@ -1,92 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-from typing import Union, List, Dict
-
-import mlrun.feature_store as fs
-from mlrun.execution import MLClientCtx
-from mlrun.data_types import InferOptions
-
-
-def ingest(
-    context: MLClientCtx,
-    featureset: str,
-    source: str,
-    targets: List[Union[str, Dict]] = None,
-    namespace=None,
-    infer_options=None,
-    run_config: Union[str, Dict] = None,
-    spark_context=None,
-    overwrite=None,
-):
-    """Read local DataFrame, file, URL, or source into the feature store
-    Ingest reads from the source, run the graph transformations, infers  metadata and stats
-    and writes the results to the default of specified targets
-
-    when targets are not specified data is stored in the configured default targets
-    (will usually be NoSQL for real-time and Parquet for offline).
-
-    example::
-
-        stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-        stocks = pd.read_csv("stocks.csv")
-        df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-        # for running as remote job
-        config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-        df = ingest(stocks_set, stocks, run_config=config)
-
-        # specify source and targets
-        source = CSVSource("mycsv", path="measurements.csv")
-        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-        ingest(measurements, source, targets)
-
-    :param context:       MLRun context
-    :param featureset:    feature set object or featureset.uri. (uri must be of a feature set that is in the DB,
-                          call `.save()` if it's not)
-    :param source:        source dataframe or file path
-    :param targets:       optional list of data target objects
-    :param namespace:     namespace or module containing graph classes
-    :param infer_options: schema and stats infer options
-    :param run_config:    function and/or run configuration for remote jobs,
-                          see :py:class:`~mlrun.feature_store.RunConfig`
-    :param spark_context: local spark session for spark ingestion, example for creating the spark context:
-                          `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-                          For remote spark ingestion, this should contain the remote spark service name
-    :param overwrite:     delete the targets' data prior to ingestion
-                          (default: True for non-scheduled ingest - deletes the targets that are about to be ingested.
-                                    False for scheduled ingest - does not delete the target)
-
-    """
-    # Setting infer_options to default:
-    context._parameters["infer_options"] = infer_options or InferOptions.default()
-
-    context.logger.info(f"Calling ingestion task with: {featureset}")
-
-    # ingest called with mlrun_context, feature_set, source and targets passed with context
-    # This params here for documentation purposes only
-    fs.ingest(
-        mlrun_context=context,
-        namespace=namespace,
-        spark_context=spark_context,
-    )
-    context.log_result("featureset", featureset)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/1.1.0/src/function.yaml b/functions/development/ingest/1.1.0/src/function.yaml deleted file mode 100644 index a05ca669..00000000 --- a/functions/development/ingest/1.1.0/src/function.yaml +++ /dev/null @@ -1,87 +0,0 @@ -kind: job -metadata: - name: ingest - tag: '' - hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29
uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG11c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmd
ldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo= - commands: [] - code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py - origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py - entry_points: - ingest: - name: ingest - doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\ - \ reads from the source, run the graph transformations, infers metadata and\ - \ stats\nand writes the results to the default of specified targets\n\nwhen\ - \ targets are not specified data is stored in the configured default targets\n\ - (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\ - \n stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\ - \ stocks = pd.read_csv(\"stocks.csv\")\n df = ingest(stocks_set, stocks,\ - \ infer_options=fstore.InferOptions.default())\n\n # for running as remote\ - \ job\n config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \ - \ df = ingest(stocks_set, stocks, run_config=config)\n\n # specify source\ - \ and targets\n source = CSVSource(\"mycsv\", path=\"measurements.csv\"\ - )\n targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n ingest(measurements,\ - \ source, targets)" - parameters: - - name: context - type: MLClientCtx - 
doc: MLRun context - default: '' - - name: featureset - type: str - doc: feature set object or featureset.uri. (uri must be of a feature set that - is in the DB, call `.save()` if it's not) - default: '' - - name: source - type: str - doc: source dataframe or file path - default: '' - - name: targets - type: List[Union[str, Dict]] - doc: optional list of data target objects - default: null - - name: namespace - doc: namespace or module containing graph classes - default: null - - name: infer_options - doc: schema and stats infer options - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: spark_context - doc: 'local spark session for spark ingestion, example for creating the spark - context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service - name' - default: null - - name: overwrite - doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled - ingest - deletes the targets that are about to be ingested. False for scheduled - ingest - does not delete the target)' - default: null - outputs: - - default: '' - lineno: 8 - description: Feature Store ingest function that runs the transformation graph on - the source of the featureset. 
- default_handler: ingest - disable_auto_mount: false - env: [] - priority_class_name: '' - affinity: null -verbose: false diff --git a/functions/development/ingest/1.1.0/src/ingest.ipynb b/functions/development/ingest/1.1.0/src/ingest.ipynb deleted file mode 100644 index 7da398b4..00000000 --- a/functions/development/ingest/1.1.0/src/ingest.ipynb +++ /dev/null @@ -1,762 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feature Store Ingest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read local DataFrame, file, URL, or source into the feature store\n", - "Ingest reads from the source, run the graph transformations, infers metadata and stats\n", - "and writes the results to the default of specified targets." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Project" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB\n" - ] - } - ], - "source": [ - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('ingest', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " 
pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\"),\n", - " ],\n", - " \"ticker\": [\"GOOG\", \"MSFT\", \"MSFT\", \"MSFT\", \"GOOG\", \"AAPL\", \"GOOG\", \"MSFT\"],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],\n", - " }\n", - ")\n", - "\n", - "# move date:\n", - "max_date = quotes[\"time\"].max()\n", - "now_date = datetime.datetime.now()\n", - "delta = now_date - max_date\n", - "quotes[\"time\"] = quotes[\"time\"] + delta" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 13:52:16.905388 GOOG 720.50 720.93\n", - "1 2022-01-31 13:52:16.905388 MSFT 51.95 51.96\n", - "2 2022-01-31 13:52:16.912388 MSFT 51.97 51.98\n", - "3 2022-01-31 13:52:16.923388 MSFT 51.99 52.00\n", - "4 2022-01-31 13:52:16.930388 GOOG 720.50 720.93\n", - "5 2022-01-31 13:52:16.931388 AAPL 97.99 98.01\n", - "6 2022-01-31 13:52:16.954388 GOOG 720.50 720.88\n", - "7 2022-01-31 13:52:16.957388 MSFT 52.01 52.03" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build Advanced Feature Set - With Feature Engineering Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a custom pipeline step (python class)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Build and show the transformatiom pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "map.MyMap\n", - "\n", - "map.MyMap\n", - "\n", - "\n", - "\n", - "_start->map.MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "map.MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - 
"storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - "filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])\n", - "\n", - "quotes_set.graph.to(\"map.MyMap\", multiplier=3).to(\n", - " \"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\"\n", - ").to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\").to(\n", - " FeaturesetValidator()\n", - ")\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(\n", - " validator=MinMaxValidator(min=52, severity=\"info\")\n", - ")\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Saving the feature set in the feature store " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set.save()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating the data source of the feature set to apply the ingest on:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "data_uri = 'quotes.csv'\n", - "quotes.to_csv(data_uri, index=False)\n", - "source = CSVSource('quotes', data_uri).to_dict()\n", - "source" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import ingest function" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "ingest_fn = mlrun.import_function(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the function locally" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080\n", - "> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes\n", - "> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}\n", - "> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:\n", - "> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:20,045 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "ingest_run = ingest_fn.run(\n", - " handler=\"ingest\",\n", - " params={\n", - " \"featureset\": quotes_set.uri,\n", - " \"source\": source,\n", - " },\n", - " local=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View of the targets' state after run" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'created'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fstore.get_feature_set(ingest_run.outputs['featureset']).status.state" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/ingest/1.1.0/src/ingest.py b/functions/development/ingest/1.1.0/src/ingest.py deleted file mode 100644 index 1412cbaf..00000000 --- a/functions/development/ingest/1.1.0/src/ingest.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with 
the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import Union, List, Dict - -import mlrun.feature_store as fs -from mlrun.execution import MLClientCtx -from mlrun.data_types import InferOptions - - -def ingest( - context: MLClientCtx, - featureset: str, - source: str, - targets: List[Union[str, Dict]] = None, - namespace=None, - infer_options=None, - run_config: Union[str, Dict] = None, - spark_context=None, - overwrite=None, -): - """Read local DataFrame, file, URL, or source into the feature store - Ingest reads from the source, run the graph transformations, infers metadata and stats - and writes the results to the default of specified targets - - when targets are not specified data is stored in the configured default targets - (will usually be NoSQL for real-time and Parquet for offline). - - example:: - - stocks_set = FeatureSet("stocks", entities=[Entity("ticker")]) - stocks = pd.read_csv("stocks.csv") - df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default()) - - # for running as remote job - config = RunConfig(image='mlrun/mlrun').apply(mount_v3io()) - df = ingest(stocks_set, stocks, run_config=config) - - # specify source and targets - source = CSVSource("mycsv", path="measurements.csv") - targets = [CSVTarget("mycsv", path="./mycsv.csv")] - ingest(measurements, source, targets) - - :param context: MLRun context - :param featureset: feature set object or featureset.uri. 
(uri must be of a feature set that is in the DB, - call `.save()` if it's not) - :param source: source dataframe or file path - :param targets: optional list of data target objects - :param namespace: namespace or module containing graph classes - :param infer_options: schema and stats infer options - :param run_config: function and/or run configuration for remote jobs, - see :py:class:`~mlrun.feature_store.RunConfig` - :param spark_context: local spark session for spark ingestion, example for creating the spark context: - `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service name - :param overwrite: delete the targets' data prior to ingestion - (default: True for non-scheduled ingest - deletes the targets that are about to be ingested. - False for scheduled ingest - does not delete the target) - - """ - # Setting infer_options to default: - context._parameters["infer_options"] = infer_options or InferOptions.default() - - context.logger.info(f"Calling ingestion task with: {featureset}") - - # ingest called with mlrun_context, feature_set, source and targets passed with context - # This params here for documentation purposes only - fs.ingest( - mlrun_context=context, - namespace=namespace, - spark_context=spark_context, - ) - context.log_result("featureset", featureset) diff --git a/functions/development/ingest/1.1.0/src/item.yaml b/functions/development/ingest/1.1.0/src/item.yaml deleted file mode 100644 index 8665e88f..00000000 --- a/functions/development/ingest/1.1.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: Feature Store ingest function that runs the transformation graph on the - source of the featureset. 
-doc: '' -example: ingest.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: ingest -platformVersion: 3.5.0 -spec: - filename: ingest.py - handler: ingest - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/ingest/1.1.0/src/test_ingest.py b/functions/development/ingest/1.1.0/src/test_ingest.py deleted file mode 100644 index 224f520b..00000000 --- a/functions/development/ingest/1.1.0/src/test_ingest.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os -import tempfile -import shutil -import datetime -import pytest - -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.sources import CSVSource -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -import pandas as pd - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. 
- """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.new_project("ingest-test") - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts' directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame): - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - # move date: - max_date = quotes["time"].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - quotes["time"] = quotes["time"] + delta - - return quotes - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return 
event - - -def _create_feature_set(): - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("test_ingest.MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions - quotes_set.set_targets() - return quotes_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_ingest(): - artifact_path, project = _set_environment() - ingest_fn = mlrun.import_function("function.yaml") - quotes = create_dataframes() - - quotes_set = _create_feature_set() - quotes_set.save() - - data_uri = os.path.join(artifact_path, "quotes.csv") - quotes.to_csv(data_uri, index=False) - source = CSVSource("quotes", data_uri).to_dict() - - ingest_run = None - try: - ingest_run = ingest_fn.run( - handler="ingest", - params={ - "featureset": quotes_set.uri, - "source": source, - }, - local=True, - ) - - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert ( - fstore.get_feature_set(ingest_run.outputs["featureset"]).status.state - == "created" - ), "Targets not created successfully" - _cleanup_environment(artifact_path) diff --git a/functions/development/ingest/1.1.0/static/documentation.html b/functions/development/ingest/1.1.0/static/documentation.html deleted file mode 100644 index 7c25074a..00000000 --- a/functions/development/ingest/1.1.0/static/documentation.html +++ 
/dev/null @@ -1,270 +0,0 @@ - - - - - - - -ingest package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

ingest package

- -
- -
-
-
-
-
-

ingest package#

-
-

Submodules#

-
-
-

ingest.ingest module#

-
-
-ingest.ingest.ingest(context: mlrun.execution.MLClientCtx, featureset: str, source: str, targets: Optional[List[Union[str, Dict]]] = None, namespace=None, infer_options=None, run_config: Optional[Union[str, Dict]] = None, spark_context=None, overwrite=None)[source]#
-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets

-

when targets are not specified data is stored in the configured default targets -(will usually be NoSQL for real-time and Parquet for offline).

-

example:

-
stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-stocks = pd.read_csv("stocks.csv")
-df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-# for running as remote job
-config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-df = ingest(stocks_set, stocks, run_config=config)
-
-# specify source and targets
-source = CSVSource("mycsv", path="measurements.csv")
-targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-ingest(measurements, source, targets)
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • featureset – feature set object or featureset.uri. (uri must be of a feature set that is in the DB, -call .save() if it’s not)

  • -
  • source – source dataframe or file path

  • -
  • targets – optional list of data target objects

  • -
  • namespace – namespace or module containing graph classes

  • -
  • infer_options – schema and stats infer options

  • -
  • run_config – function and/or run configuration for remote jobs, -see RunConfig

  • -
  • spark_context – local spark session for spark ingestion, example for creating the spark context: -spark = SparkSession.builder.appName(“Spark function”).getOrCreate() -For remote spark ingestion, this should contain the remote spark service name

  • -
  • overwrite

    delete the targets’ data prior to ingestion -(default: True for non-scheduled ingest - deletes the targets that are about to be ingested.

    -
    -

    False for scheduled ingest - does not delete the target)

    -
    -

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/ingest/1.1.0/static/example.html b/functions/development/ingest/1.1.0/static/example.html deleted file mode 100644 index ae4fa592..00000000 --- a/functions/development/ingest/1.1.0/static/example.html +++ /dev/null @@ -1,728 +0,0 @@ - - - - - - - -Feature Store Ingest - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Feature Store Ingest#

-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets.

-
-

Creating Project#

-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from  mlrun.datastore.sources import CSVSource
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-
-
-
-
-
-
-
# Initialize the MLRun project object
-project = mlrun.get_or_create_project('ingest', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB
-
-
-
-
-
-
-

Create Sample Data For Demo#

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075"),
-        ],
-        "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
-        "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-        "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
-    }
-)
-
-# move date:
-max_date = quotes["time"].max()
-now_date = datetime.datetime.now()
-delta = now_date - max_date
-quotes["time"] = quotes["time"] + delta
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
-
-
-
-
-

Build Advanced Feature Set - With Feature Engineering Pipeline#

-

Define a custom pipeline step (python class)

-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-

Build and show the transformatiom pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-quotes_set.graph.to("map.MyMap", multiplier=3).to(
-    "storey.Extend", _fn="({'extra': event['bid'] * 77})"
-).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to(
-    FeaturesetValidator()
-)
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(
-    validator=MinMaxValidator(min=52, severity="info")
-)
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/90dea0f8f896da07046841b1af664828b2019e767b67694d701904b76cd2a0ae.svg
-
-

Saving the feature set in the feature store

-
-
-
quotes_set.save()
-
-
-
-
-

Creating the data source of the feature set to apply the ingest on:

-
-
-
data_uri = 'quotes.csv'
-quotes.to_csv(data_uri, index=False)
-source = CSVSource('quotes', data_uri).to_dict()
-source
-
-
-
-
-
-
-

Import ingest function#

-
-
-
ingest_fn = mlrun.import_function("function.yaml")
-
-
-
-
-
-
-

Running the function locally#

-
-
-
ingest_run = ingest_fn.run(
-    handler="ingest",
-    params={
-        "featureset": quotes_set.uri,
-        "source": source,
-    },
-    local=True,
-)
-
-
-
-
-
> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080
-> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes
-> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}
-> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:
-> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 13:52:20,045 [info] run executed, status=completed
-
-
-
-
-
-
-

View of the targets’ state after run#

-
-
-
fstore.get_feature_set(ingest_run.outputs['featureset']).status.state
-
-
-
-
-
'created'
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/ingest/1.1.0/static/function.html b/functions/development/ingest/1.1.0/static/function.html deleted file mode 100644 index d58ef431..00000000 --- a/functions/development/ingest/1.1.0/static/function.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: ingest
-  tag: ''
-  hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG1
1c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmdldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5
hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo=
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py
-    origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py
-  entry_points:
-    ingest:
-      name: ingest
-      doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\
-        \ reads from the source, run the graph transformations, infers  metadata and\
-        \ stats\nand writes the results to the default of specified targets\n\nwhen\
-        \ targets are not specified data is stored in the configured default targets\n\
-        (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\
-        \n    stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\
-        \    stocks = pd.read_csv(\"stocks.csv\")\n    df = ingest(stocks_set, stocks,\
-        \ infer_options=fstore.InferOptions.default())\n\n    # for running as remote\
-        \ job\n    config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \
-        \   df = ingest(stocks_set, stocks, run_config=config)\n\n    # specify source\
-        \ and targets\n    source = CSVSource(\"mycsv\", path=\"measurements.csv\"\
-        )\n    targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n    ingest(measurements,\
-        \ source, targets)"
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: featureset
-        type: str
-        doc: feature set object or featureset.uri. (uri must be of a feature set that
-          is in the DB, call `.save()` if it's not)
-        default: ''
-      - name: source
-        type: str
-        doc: source dataframe or file path
-        default: ''
-      - name: targets
-        type: List[Union[str, Dict]]
-        doc: optional list of data target objects
-        default: null
-      - name: namespace
-        doc: namespace or module containing graph classes
-        default: null
-      - name: infer_options
-        doc: schema and stats infer options
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: spark_context
-        doc: 'local spark session for spark ingestion, example for creating the spark
-          context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-          For remote spark ingestion, this should contain the remote spark service
-          name'
-        default: null
-      - name: overwrite
-        doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled
-          ingest - deletes the targets that are about to be ingested. False for scheduled
-          ingest - does not delete the target)'
-        default: null
-      outputs:
-      - default: ''
-      lineno: 8
-  description: Feature Store ingest function that runs the transformation graph on
-    the source of the featureset.
-  default_handler: ingest
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/1.1.0/static/ingest.html b/functions/development/ingest/1.1.0/static/ingest.html deleted file mode 100644 index 32c6e3f1..00000000 --- a/functions/development/ingest/1.1.0/static/ingest.html +++ /dev/null @@ -1,224 +0,0 @@ - - - - - - - -ingest.ingest - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for ingest.ingest

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun.feature_store as fs
-from mlrun.execution import MLClientCtx
-from mlrun.data_types import InferOptions
-
-
-
[docs]def ingest( - context: MLClientCtx, - featureset: str, - source: str, - targets: List[Union[str, Dict]] = None, - namespace=None, - infer_options=None, - run_config: Union[str, Dict] = None, - spark_context=None, - overwrite=None, -): - """Read local DataFrame, file, URL, or source into the feature store - Ingest reads from the source, run the graph transformations, infers metadata and stats - and writes the results to the default of specified targets - - when targets are not specified data is stored in the configured default targets - (will usually be NoSQL for real-time and Parquet for offline). - - example:: - - stocks_set = FeatureSet("stocks", entities=[Entity("ticker")]) - stocks = pd.read_csv("stocks.csv") - df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default()) - - # for running as remote job - config = RunConfig(image='mlrun/mlrun').apply(mount_v3io()) - df = ingest(stocks_set, stocks, run_config=config) - - # specify source and targets - source = CSVSource("mycsv", path="measurements.csv") - targets = [CSVTarget("mycsv", path="./mycsv.csv")] - ingest(measurements, source, targets) - - :param context: MLRun context - :param featureset: feature set object or featureset.uri. 
(uri must be of a feature set that is in the DB, - call `.save()` if it's not) - :param source: source dataframe or file path - :param targets: optional list of data target objects - :param namespace: namespace or module containing graph classes - :param infer_options: schema and stats infer options - :param run_config: function and/or run configuration for remote jobs, - see :py:class:`~mlrun.feature_store.RunConfig` - :param spark_context: local spark session for spark ingestion, example for creating the spark context: - `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service name - :param overwrite: delete the targets' data prior to ingestion - (default: True for non-scheduled ingest - deletes the targets that are about to be ingested. - False for scheduled ingest - does not delete the target) - - """ - # Setting infer_options to default: - context._parameters["infer_options"] = infer_options or InferOptions.default() - - context.logger.info(f"Calling ingestion task with: {featureset}") - - # ingest called with mlrun_context, feature_set, source and targets passed with context - # This params here for documentation purposes only - fs.ingest( - mlrun_context=context, - namespace=namespace, - spark_context=spark_context, - ) - context.log_result("featureset", featureset)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/ingest/1.1.0/static/item.html b/functions/development/ingest/1.1.0/static/item.html deleted file mode 100644 index e36ddb73..00000000 --- a/functions/development/ingest/1.1.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: Feature Store ingest function that runs the transformation graph on the
-  source of the featureset.
-doc: ''
-example: ingest.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: ingest
-platformVersion: 3.5.0
-spec:
-  filename: ingest.py
-  handler: ingest
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/1.1.0/static/source.html b/functions/development/ingest/1.1.0/static/source.html deleted file mode 100644 index ade154bf..00000000 --- a/functions/development/ingest/1.1.0/static/source.html +++ /dev/null @@ -1,106 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun.feature_store as fs
-from mlrun.execution import MLClientCtx
-from mlrun.data_types import InferOptions
-
-
-def ingest(
-    context: MLClientCtx,
-    featureset: str,
-    source: str,
-    targets: List[Union[str, Dict]] = None,
-    namespace=None,
-    infer_options=None,
-    run_config: Union[str, Dict] = None,
-    spark_context=None,
-    overwrite=None,
-):
-    """Read local DataFrame, file, URL, or source into the feature store
-    Ingest reads from the source, run the graph transformations, infers  metadata and stats
-    and writes the results to the default of specified targets
-
-    when targets are not specified data is stored in the configured default targets
-    (will usually be NoSQL for real-time and Parquet for offline).
-
-    example::
-
-        stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-        stocks = pd.read_csv("stocks.csv")
-        df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-        # for running as remote job
-        config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-        df = ingest(stocks_set, stocks, run_config=config)
-
-        # specify source and targets
-        source = CSVSource("mycsv", path="measurements.csv")
-        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-        ingest(measurements, source, targets)
-
-    :param context:       MLRun context
-    :param featureset:    feature set object or featureset.uri. (uri must be of a feature set that is in the DB,
-                          call `.save()` if it's not)
-    :param source:        source dataframe or file path
-    :param targets:       optional list of data target objects
-    :param namespace:     namespace or module containing graph classes
-    :param infer_options: schema and stats infer options
-    :param run_config:    function and/or run configuration for remote jobs,
-                          see :py:class:`~mlrun.feature_store.RunConfig`
-    :param spark_context: local spark session for spark ingestion, example for creating the spark context:
-                          `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-                          For remote spark ingestion, this should contain the remote spark service name
-    :param overwrite:     delete the targets' data prior to ingestion
-                          (default: True for non-scheduled ingest - deletes the targets that are about to be ingested.
-                                    False for scheduled ingest - does not delete the target)
-
-    """
-    # Setting infer_options to default:
-    context._parameters["infer_options"] = infer_options or InferOptions.default()
-
-    context.logger.info(f"Calling ingestion task with: {featureset}")
-
-    # ingest called with mlrun_context, feature_set, source and targets passed with context
-    # This params here for documentation purposes only
-    fs.ingest(
-        mlrun_context=context,
-        namespace=namespace,
-        spark_context=spark_context,
-    )
-    context.log_result("featureset", featureset)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/latest/src/function.yaml b/functions/development/ingest/latest/src/function.yaml deleted file mode 100644 index a05ca669..00000000 --- a/functions/development/ingest/latest/src/function.yaml +++ /dev/null @@ -1,87 +0,0 @@ -kind: job -metadata: - name: ingest - tag: '' - hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6 - project: '' - labels: - author: yonish - categories: - - data-preparation - - data-analysis - - feature-store -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVu
Q29uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG11c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRh
cmdldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo= - commands: [] - code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py - origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py - entry_points: - ingest: - name: ingest - doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\ - \ reads from the source, run the graph transformations, infers metadata and\ - \ stats\nand writes the results to the default of specified targets\n\nwhen\ - \ targets are not specified data is stored in the configured default targets\n\ - (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\ - \n stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\ - \ stocks = pd.read_csv(\"stocks.csv\")\n df = ingest(stocks_set, stocks,\ - \ infer_options=fstore.InferOptions.default())\n\n # for running as remote\ - \ job\n config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \ - \ df = ingest(stocks_set, stocks, run_config=config)\n\n # specify source\ - \ and targets\n source = CSVSource(\"mycsv\", path=\"measurements.csv\"\ - )\n targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n ingest(measurements,\ - \ source, targets)" - parameters: - - name: context - type: MLClientCtx - 
doc: MLRun context - default: '' - - name: featureset - type: str - doc: feature set object or featureset.uri. (uri must be of a feature set that - is in the DB, call `.save()` if it's not) - default: '' - - name: source - type: str - doc: source dataframe or file path - default: '' - - name: targets - type: List[Union[str, Dict]] - doc: optional list of data target objects - default: null - - name: namespace - doc: namespace or module containing graph classes - default: null - - name: infer_options - doc: schema and stats infer options - default: null - - name: run_config - type: Union[str, Dict] - doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig` - default: null - - name: spark_context - doc: 'local spark session for spark ingestion, example for creating the spark - context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service - name' - default: null - - name: overwrite - doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled - ingest - deletes the targets that are about to be ingested. False for scheduled - ingest - does not delete the target)' - default: null - outputs: - - default: '' - lineno: 8 - description: Feature Store ingest function that runs the transformation graph on - the source of the featureset. 
- default_handler: ingest - disable_auto_mount: false - env: [] - priority_class_name: '' - affinity: null -verbose: false diff --git a/functions/development/ingest/latest/src/ingest.ipynb b/functions/development/ingest/latest/src/ingest.ipynb deleted file mode 100644 index 7da398b4..00000000 --- a/functions/development/ingest/latest/src/ingest.ipynb +++ /dev/null @@ -1,762 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feature Store Ingest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read local DataFrame, file, URL, or source into the feature store\n", - "Ingest reads from the source, run the graph transformations, infers metadata and stats\n", - "and writes the results to the default of specified targets." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating Project" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.sources import CSVSource\n", - "from mlrun.feature_store.steps import *\n", - "from mlrun.features import MinMaxValidator\n", - "import pandas as pd\n", - "import datetime" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB\n" - ] - } - ], - "source": [ - "# Initialize the MLRun project object\n", - "project = mlrun.get_or_create_project('ingest', context=\"./\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Sample Data For Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "quotes = pd.DataFrame(\n", - " {\n", - " \"time\": [\n", - " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " 
pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", - " pd.Timestamp(\"2016-05-25 13:30:00.075\"),\n", - " ],\n", - " \"ticker\": [\"GOOG\", \"MSFT\", \"MSFT\", \"MSFT\", \"GOOG\", \"AAPL\", \"GOOG\", \"MSFT\"],\n", - " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", - " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],\n", - " }\n", - ")\n", - "\n", - "# move date:\n", - "max_date = quotes[\"time\"].max()\n", - "now_date = datetime.datetime.now()\n", - "delta = now_date - max_date\n", - "quotes[\"time\"] = quotes[\"time\"] + delta" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
\n", - "
" - ], - "text/plain": [ - " time ticker bid ask\n", - "0 2022-01-31 13:52:16.905388 GOOG 720.50 720.93\n", - "1 2022-01-31 13:52:16.905388 MSFT 51.95 51.96\n", - "2 2022-01-31 13:52:16.912388 MSFT 51.97 51.98\n", - "3 2022-01-31 13:52:16.923388 MSFT 51.99 52.00\n", - "4 2022-01-31 13:52:16.930388 GOOG 720.50 720.93\n", - "5 2022-01-31 13:52:16.931388 AAPL 97.99 98.01\n", - "6 2022-01-31 13:52:16.954388 GOOG 720.50 720.88\n", - "7 2022-01-31 13:52:16.957388 MSFT 52.01 52.03" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build Advanced Feature Set - With Feature Engineering Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a custom pipeline step (python class)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class MyMap(MapClass):\n", - " def __init__(self, multiplier=1, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self._multiplier = multiplier\n", - "\n", - " def do(self, event):\n", - " event[\"multi\"] = event[\"bid\"] * self._multiplier\n", - " return event" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Build and show the transformatiom pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "map.MyMap\n", - "\n", - "map.MyMap\n", - "\n", - "\n", - "\n", - "_start->map.MyMap\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "storey.Extend\n", - "\n", - "storey.Extend\n", - "\n", - "\n", - "\n", - "map.MyMap->storey.Extend\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "filter\n", - "\n", - "filter\n", - "\n", - "\n", - "\n", - 
"storey.Extend->filter\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "FeaturesetValidator\n", - "\n", - "FeaturesetValidator\n", - "\n", - "\n", - "\n", - "filter->FeaturesetValidator\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "FeaturesetValidator->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "Aggregates->parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "Aggregates->nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "quotes_set = fstore.FeatureSet(\"stock-quotes\", entities=[fstore.Entity(\"ticker\")])\n", - "\n", - "quotes_set.graph.to(\"map.MyMap\", multiplier=3).to(\n", - " \"storey.Extend\", _fn=\"({'extra': event['bid'] * 77})\"\n", - ").to(\"storey.Filter\", \"filter\", _fn=\"(event['bid'] > 51.92)\").to(\n", - " FeaturesetValidator()\n", - ")\n", - "\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"1h\", \"10m\", name=\"asks1\")\n", - "quotes_set.add_aggregation(\"ask\", [\"sum\", \"max\"], \"5h\", \"10m\", name=\"asks5\")\n", - "quotes_set.add_aggregation(\"bid\", [\"min\", \"max\"], \"1h\", \"10m\", name=\"bids\")\n", - "\n", - "# add feature validation policy\n", - "quotes_set[\"bid\"] = fstore.Feature(\n", - " validator=MinMaxValidator(min=52, severity=\"info\")\n", - ")\n", - "\n", - "# add default target definitions and plot\n", - "quotes_set.set_targets()\n", - "quotes_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Saving the feature set in the feature store " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "quotes_set.save()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating the data source of the feature set to apply the ingest on:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "data_uri = 'quotes.csv'\n", - "quotes.to_csv(data_uri, index=False)\n", - "source = CSVSource('quotes', data_uri).to_dict()\n", - "source" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import ingest function" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "ingest_fn = mlrun.import_function(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the function locally" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080\n", - "> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes\n", - "> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}\n", - "info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}\n", - "info! 
bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}\n", - "> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:\n", - "> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-01-31 13:52:20,045 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "ingest_run = ingest_fn.run(\n", - " handler=\"ingest\",\n", - " params={\n", - " \"featureset\": quotes_set.uri,\n", - " \"source\": source,\n", - " },\n", - " local=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View of the targets' state after run" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'created'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fstore.get_feature_set(ingest_run.outputs['featureset']).status.state" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/ingest/latest/src/ingest.py b/functions/development/ingest/latest/src/ingest.py deleted file mode 100644 index 1412cbaf..00000000 --- a/functions/development/ingest/latest/src/ingest.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance 
with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import Union, List, Dict - -import mlrun.feature_store as fs -from mlrun.execution import MLClientCtx -from mlrun.data_types import InferOptions - - -def ingest( - context: MLClientCtx, - featureset: str, - source: str, - targets: List[Union[str, Dict]] = None, - namespace=None, - infer_options=None, - run_config: Union[str, Dict] = None, - spark_context=None, - overwrite=None, -): - """Read local DataFrame, file, URL, or source into the feature store - Ingest reads from the source, run the graph transformations, infers metadata and stats - and writes the results to the default of specified targets - - when targets are not specified data is stored in the configured default targets - (will usually be NoSQL for real-time and Parquet for offline). - - example:: - - stocks_set = FeatureSet("stocks", entities=[Entity("ticker")]) - stocks = pd.read_csv("stocks.csv") - df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default()) - - # for running as remote job - config = RunConfig(image='mlrun/mlrun').apply(mount_v3io()) - df = ingest(stocks_set, stocks, run_config=config) - - # specify source and targets - source = CSVSource("mycsv", path="measurements.csv") - targets = [CSVTarget("mycsv", path="./mycsv.csv")] - ingest(measurements, source, targets) - - :param context: MLRun context - :param featureset: feature set object or featureset.uri. 
(uri must be of a feature set that is in the DB, - call `.save()` if it's not) - :param source: source dataframe or file path - :param targets: optional list of data target objects - :param namespace: namespace or module containing graph classes - :param infer_options: schema and stats infer options - :param run_config: function and/or run configuration for remote jobs, - see :py:class:`~mlrun.feature_store.RunConfig` - :param spark_context: local spark session for spark ingestion, example for creating the spark context: - `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service name - :param overwrite: delete the targets' data prior to ingestion - (default: True for non-scheduled ingest - deletes the targets that are about to be ingested. - False for scheduled ingest - does not delete the target) - - """ - # Setting infer_options to default: - context._parameters["infer_options"] = infer_options or InferOptions.default() - - context.logger.info(f"Calling ingestion task with: {featureset}") - - # ingest called with mlrun_context, feature_set, source and targets passed with context - # This params here for documentation purposes only - fs.ingest( - mlrun_context=context, - namespace=namespace, - spark_context=spark_context, - ) - context.log_result("featureset", featureset) diff --git a/functions/development/ingest/latest/src/item.yaml b/functions/development/ingest/latest/src/item.yaml deleted file mode 100644 index 8665e88f..00000000 --- a/functions/development/ingest/latest/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -- data-analysis -- feature-store -description: Feature Store ingest function that runs the transformation graph on the - source of the featureset. 
-doc: '' -example: ingest.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yonish -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: ingest -platformVersion: 3.5.0 -spec: - filename: ingest.py - handler: ingest - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/ingest/latest/src/test_ingest.py b/functions/development/ingest/latest/src/test_ingest.py deleted file mode 100644 index 224f520b..00000000 --- a/functions/development/ingest/latest/src/test_ingest.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os -import tempfile -import shutil -import datetime -import pytest - -import mlrun -import mlrun.feature_store as fstore -from mlrun.datastore.sources import CSVSource -from mlrun.feature_store.steps import * -from mlrun.features import MinMaxValidator -import pandas as pd - -REQUIRED_ENV_VARS = [ - "MLRUN_DBPATH", - "MLRUN_ARTIFACT_PATH", - "V3IO_USERNAME", - "V3IO_API", - "V3IO_ACCESS_KEY", -] - - -def _validate_environment_variables() -> bool: - """ - Checks that all required Environment variables are set. 
- """ - environment_keys = os.environ.keys() - return all(key in environment_keys for key in REQUIRED_ENV_VARS) - - -def _set_environment(): - artifact_path = tempfile.TemporaryDirectory().name - os.makedirs(artifact_path) - project = mlrun.new_project("ingest-test") - return artifact_path, project - - -def _cleanup_environment(artifact_path: str): - """ - Cleanup the test environment, deleting files and artifacts created during the test. - - :param artifact_path: The artifact path to delete. - """ - # Clean the local directory: - for test_output in [ - *os.listdir(artifact_path), - "schedules", - "runs", - "artifacts", - "functions", - ]: - test_output_path = os.path.abspath(f"./{test_output}") - if os.path.exists(test_output_path): - if os.path.isdir(test_output_path): - shutil.rmtree(test_output_path) - else: - os.remove(test_output_path) - - # Clean the artifacts' directory: - shutil.rmtree(artifact_path) - - -def create_dataframes() -> (pd.DataFrame, pd.DataFrame): - quotes = pd.DataFrame( - { - "time": [ - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.023"), - pd.Timestamp("2016-05-25 13:30:00.030"), - pd.Timestamp("2016-05-25 13:30:00.041"), - pd.Timestamp("2016-05-25 13:30:00.048"), - pd.Timestamp("2016-05-25 13:30:00.049"), - pd.Timestamp("2016-05-25 13:30:00.072"), - pd.Timestamp("2016-05-25 13:30:00.075"), - ], - "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], - "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], - } - ) - - # move date: - max_date = quotes["time"].max() - now_date = datetime.datetime.now() - delta = now_date - max_date - quotes["time"] = quotes["time"] + delta - - return quotes - - -class MyMap(MapClass): - def __init__(self, multiplier=1, **kwargs): - super().__init__(**kwargs) - self._multiplier = multiplier - - def do(self, event): - event["multi"] = event["bid"] * self._multiplier - return 
event - - -def _create_feature_set(): - quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")]) - - quotes_set.graph.to("test_ingest.MyMap", multiplier=3).to( - "storey.Extend", _fn="({'extra': event['bid'] * 77})" - ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to( - FeaturesetValidator() - ) - - quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1") - quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5") - quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids") - - # add feature validation policy - quotes_set["bid"] = fstore.Feature( - validator=MinMaxValidator(min=52, severity="info") - ) - - # add default target definitions - quotes_set.set_targets() - return quotes_set - - -@pytest.mark.skipif( - condition=not _validate_environment_variables(), - reason="Project's environment variables are not set", -) -def test_ingest(): - artifact_path, project = _set_environment() - ingest_fn = mlrun.import_function("function.yaml") - quotes = create_dataframes() - - quotes_set = _create_feature_set() - quotes_set.save() - - data_uri = os.path.join(artifact_path, "quotes.csv") - quotes.to_csv(data_uri, index=False) - source = CSVSource("quotes", data_uri).to_dict() - - ingest_run = None - try: - ingest_run = ingest_fn.run( - handler="ingest", - params={ - "featureset": quotes_set.uri, - "source": source, - }, - local=True, - ) - - except Exception as exception: - print(f"- The test failed - raised the following error:\n- {exception}") - assert ( - fstore.get_feature_set(ingest_run.outputs["featureset"]).status.state - == "created" - ), "Targets not created successfully" - _cleanup_environment(artifact_path) diff --git a/functions/development/ingest/latest/static/documentation.html b/functions/development/ingest/latest/static/documentation.html deleted file mode 100644 index 7c25074a..00000000 --- a/functions/development/ingest/latest/static/documentation.html +++ 
/dev/null @@ -1,270 +0,0 @@ - - - - - - - -ingest package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

ingest package

- -
- -
-
-
-
-
-

ingest package#

-
-

Submodules#

-
-
-

ingest.ingest module#

-
-
-ingest.ingest.ingest(context: mlrun.execution.MLClientCtx, featureset: str, source: str, targets: Optional[List[Union[str, Dict]]] = None, namespace=None, infer_options=None, run_config: Optional[Union[str, Dict]] = None, spark_context=None, overwrite=None)[source]#
-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets

-

when targets are not specified data is stored in the configured default targets -(will usually be NoSQL for real-time and Parquet for offline).

-

example:

-
stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-stocks = pd.read_csv("stocks.csv")
-df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-# for running as remote job
-config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-df = ingest(stocks_set, stocks, run_config=config)
-
-# specify source and targets
-source = CSVSource("mycsv", path="measurements.csv")
-targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-ingest(measurements, source, targets)
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • featureset – feature set object or featureset.uri. (uri must be of a feature set that is in the DB, -call .save() if it’s not)

  • -
  • source – source dataframe or file path

  • -
  • targets – optional list of data target objects

  • -
  • namespace – namespace or module containing graph classes

  • -
  • infer_options – schema and stats infer options

  • -
  • run_config – function and/or run configuration for remote jobs, -see RunConfig

  • -
  • spark_context – local spark session for spark ingestion, example for creating the spark context: -spark = SparkSession.builder.appName(“Spark function”).getOrCreate() -For remote spark ingestion, this should contain the remote spark service name

  • -
  • overwrite

    delete the targets’ data prior to ingestion -(default: True for non-scheduled ingest - deletes the targets that are about to be ingested.

    -
    -

    False for scheduled ingest - does not delete the target)

    -
    -

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/ingest/latest/static/example.html b/functions/development/ingest/latest/static/example.html deleted file mode 100644 index ae4fa592..00000000 --- a/functions/development/ingest/latest/static/example.html +++ /dev/null @@ -1,728 +0,0 @@ - - - - - - - -Feature Store Ingest - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Feature Store Ingest#

-

Read local DataFrame, file, URL, or source into the feature store -Ingest reads from the source, run the graph transformations, infers metadata and stats -and writes the results to the default of specified targets.

-
-

Creating Project#

-
-
-
import mlrun
-import mlrun.feature_store as fstore
-from  mlrun.datastore.sources import CSVSource
-from mlrun.feature_store.steps import *
-from mlrun.features import MinMaxValidator
-import pandas as pd
-import datetime
-
-
-
-
-
-
-
# Initialize the MLRun project object
-project = mlrun.get_or_create_project('ingest', context="./", user_project=True)
-
-
-
-
-
> 2022-01-31 13:52:16,939 [info] loaded project ingest from MLRun DB
-
-
-
-
-
-
-

Create Sample Data For Demo#

-
-
-
quotes = pd.DataFrame(
-    {
-        "time": [
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.023"),
-            pd.Timestamp("2016-05-25 13:30:00.030"),
-            pd.Timestamp("2016-05-25 13:30:00.041"),
-            pd.Timestamp("2016-05-25 13:30:00.048"),
-            pd.Timestamp("2016-05-25 13:30:00.049"),
-            pd.Timestamp("2016-05-25 13:30:00.072"),
-            pd.Timestamp("2016-05-25 13:30:00.075"),
-        ],
-        "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
-        "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
-        "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
-    }
-)
-
-# move date:
-max_date = quotes["time"].max()
-now_date = datetime.datetime.now()
-delta = now_date - max_date
-quotes["time"] = quotes["time"] + delta
-
-
-
-
-
-
-
quotes
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
timetickerbidask
02022-01-31 13:52:16.905388GOOG720.50720.93
12022-01-31 13:52:16.905388MSFT51.9551.96
22022-01-31 13:52:16.912388MSFT51.9751.98
32022-01-31 13:52:16.923388MSFT51.9952.00
42022-01-31 13:52:16.930388GOOG720.50720.93
52022-01-31 13:52:16.931388AAPL97.9998.01
62022-01-31 13:52:16.954388GOOG720.50720.88
72022-01-31 13:52:16.957388MSFT52.0152.03
-
-
-
-
-

Build Advanced Feature Set - With Feature Engineering Pipeline#

-

Define a custom pipeline step (python class)

-
-
-
class MyMap(MapClass):
-    def __init__(self, multiplier=1, **kwargs):
-        super().__init__(**kwargs)
-        self._multiplier = multiplier
-
-    def do(self, event):
-        event["multi"] = event["bid"] * self._multiplier
-        return event
-
-
-
-
-

Build and show the transformatiom pipeline

-
-
-
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
-
-quotes_set.graph.to("map.MyMap", multiplier=3).to(
-    "storey.Extend", _fn="({'extra': event['bid'] * 77})"
-).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to(
-    FeaturesetValidator()
-)
-
-quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m", name="asks1")
-quotes_set.add_aggregation("ask", ["sum", "max"], "5h", "10m", name="asks5")
-quotes_set.add_aggregation("bid", ["min", "max"], "1h", "10m", name="bids")
-
-# add feature validation policy
-quotes_set["bid"] = fstore.Feature(
-    validator=MinMaxValidator(min=52, severity="info")
-)
-
-# add default target definitions and plot
-quotes_set.set_targets()
-quotes_set.plot(rankdir="LR", with_targets=True)
-
-
-
-
-_images/90dea0f8f896da07046841b1af664828b2019e767b67694d701904b76cd2a0ae.svg
-
-

Saving the feature set in the feature store

-
-
-
quotes_set.save()
-
-
-
-
-

Creating the data source of the feature set to apply the ingest on:

-
-
-
data_uri = 'quotes.csv'
-quotes.to_csv(data_uri, index=False)
-source = CSVSource('quotes', data_uri).to_dict()
-source
-
-
-
-
-
-
-

Import ingest function#

-
-
-
ingest_fn = mlrun.import_function("function.yaml")
-
-
-
-
-
-
-

Running the function locally#

-
-
-
ingest_run = ingest_fn.run(
-    handler="ingest",
-    params={
-        "featureset": quotes_set.uri,
-        "source": source,
-    },
-    local=True,
-)
-
-
-
-
-
> 2022-01-31 13:52:17,201 [info] starting run ingest-ingest uid=4bd5d12691a8439d90bf53847f59df1a DB=http://mlrun-api:8080
-> 2022-01-31 13:52:17,354 [info] Ingesting the FeatureSet: store://feature-sets/ingest-yonatan/stock-quotes
-> 2022-01-31 13:52:17,427 [info] starting ingestion task to store://feature-sets/ingest-yonatan/stock-quotes:latest.
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466055 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466072 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.466085 args={'min': 52, 'value': 51.99}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671677 args={'min': 52, 'value': 51.95}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671692 args={'min': 52, 'value': 51.97}
-info! bid value is smaller than min, key=['MSFT'] time=2022-01-31 13:52:19.671708 args={'min': 52, 'value': 51.99}
-> 2022-01-31 13:52:19,915 [info] ingestion task completed, targets:
-> 2022-01-31 13:52:19,915 [info] [{'name': 'parquet', 'kind': 'parquet', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/parquet/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.649303+00:00', 'last_written': datetime.datetime(2022, 1, 31, 13, 52, 19, 671753)}, {'name': 'nosql', 'kind': 'nosql', 'path': 'v3io:///projects/ingest-yonatan/FeatureStore/stock-quotes/nosql/sets/stock-quotes-latest', 'status': 'created', 'updated': '2022-01-31T13:52:19.650044+00:00'}]
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
ingest-yonatan0Jan 31 13:52:17completedingest-ingest
v3io_user=yonatan
kind=
owner=yonatan
host=jupyter-yoni-647b99c95d-w4jlc
featureset=store://feature-sets/ingest-yonatan/stock-quotes
source={'kind': 'csv', 'name': 'quotes', 'path': 'quotes.csv'}
infer_options=63
overwrite=None
targets=None
featureset=store://feature-sets/ingest-yonatan/stock-quotes
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-01-31 13:52:20,045 [info] run executed, status=completed
-
-
-
-
-
-
-

View of the targets’ state after run#

-
-
-
fstore.get_feature_set(ingest_run.outputs['featureset']).status.state
-
-
-
-
-
'created'
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/ingest/latest/static/function.html b/functions/development/ingest/latest/static/function.html deleted file mode 100644 index d58ef431..00000000 --- a/functions/development/ingest/latest/static/function.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: ingest
-  tag: ''
-  hash: 7e28700a86ebdd18d887fe588492201a1e3ef2f6
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - data-preparation
-  - data-analysis
-  - feature-store
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: ZnJvbSB0eXBpbmcgaW1wb3J0IFVuaW9uLCBMaXN0LCBEaWN0CgppbXBvcnQgbWxydW4uZmVhdHVyZV9zdG9yZSBhcyBmcwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhX3R5cGVzIGltcG9ydCBJbmZlck9wdGlvbnMKCgpkZWYgaW5nZXN0KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBmZWF0dXJlc2V0OiBzdHIsCiAgICBzb3VyY2U6IHN0ciwKICAgIHRhcmdldHM6IExpc3RbVW5pb25bc3RyLCBEaWN0XV0gPSBOb25lLAogICAgbmFtZXNwYWNlPU5vbmUsCiAgICBpbmZlcl9vcHRpb25zPU5vbmUsCiAgICBydW5fY29uZmlnOiBVbmlvbltzdHIsIERpY3RdID0gTm9uZSwKICAgIHNwYXJrX2NvbnRleHQ9Tm9uZSwKICAgIG92ZXJ3cml0ZT1Ob25lLAopOgogICAgIiIiUmVhZCBsb2NhbCBEYXRhRnJhbWUsIGZpbGUsIFVSTCwgb3Igc291cmNlIGludG8gdGhlIGZlYXR1cmUgc3RvcmUKICAgIEluZ2VzdCByZWFkcyBmcm9tIHRoZSBzb3VyY2UsIHJ1biB0aGUgZ3JhcGggdHJhbnNmb3JtYXRpb25zLCBpbmZlcnMgIG1ldGFkYXRhIGFuZCBzdGF0cwogICAgYW5kIHdyaXRlcyB0aGUgcmVzdWx0cyB0byB0aGUgZGVmYXVsdCBvZiBzcGVjaWZpZWQgdGFyZ2V0cwoKICAgIHdoZW4gdGFyZ2V0cyBhcmUgbm90IHNwZWNpZmllZCBkYXRhIGlzIHN0b3JlZCBpbiB0aGUgY29uZmlndXJlZCBkZWZhdWx0IHRhcmdldHMKICAgICh3aWxsIHVzdWFsbHkgYmUgTm9TUUwgZm9yIHJlYWwtdGltZSBhbmQgUGFycXVldCBmb3Igb2ZmbGluZSkuCgogICAgZXhhbXBsZTo6CgogICAgICAgIHN0b2Nrc19zZXQgPSBGZWF0dXJlU2V0KCJzdG9ja3MiLCBlbnRpdGllcz1bRW50aXR5KCJ0aWNrZXIiKV0pCiAgICAgICAgc3RvY2tzID0gcGQucmVhZF9jc3YoInN0b2Nrcy5jc3YiKQogICAgICAgIGRmID0gaW5nZXN0KHN0b2Nrc19zZXQsIHN0b2NrcywgaW5mZXJfb3B0aW9ucz1mc3RvcmUuSW5mZXJPcHRpb25zLmRlZmF1bHQoKSkKCiAgICAgICAgIyBmb3IgcnVubmluZyBhcyByZW1vdGUgam9iCiAgICAgICAgY29uZmlnID0gUnVuQ29uZmlnKGltYWdlPSdtbHJ1bi9tbHJ1bicpLmFwcGx5KG1vdW50X3YzaW8oKSkKICAgICAgICBkZiA9IGluZ2VzdChzdG9ja3Nfc2V0LCBzdG9ja3MsIHJ1bl9jb25maWc9Y29uZmlnKQoKICAgICAgICAjIHNwZWNpZnkgc291cmNlIGFuZCB0YXJnZXRzCiAgICAgICAgc291cmNlID0gQ1NWU291cmNlKCJteWNzdiIsIHBhdGg9Im1lYXN1cmVtZW50cy5jc3YiKQogICAgICAgIHRhcmdldHMgPSBbQ1NWVGFyZ2V0KCJteWNzdiIsIHBhdGg9Ii4vbXljc3YuY3N2IildCiAgICAgICAgaW5nZXN0KG1lYXN1cmVtZW50cywgc291cmNlLCB0YXJnZXRzKQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gZmVhdHVyZXNldDogICAgZmVhdHVyZSBzZXQgb2JqZWN0IG9yIGZlYXR1cmVzZXQudXJpLiAodXJpIG1
1c3QgYmUgb2YgYSBmZWF0dXJlIHNldCB0aGF0IGlzIGluIHRoZSBEQiwKICAgICAgICAgICAgICAgICAgICAgICAgICBjYWxsIGAuc2F2ZSgpYCBpZiBpdCdzIG5vdCkKICAgIDpwYXJhbSBzb3VyY2U6ICAgICAgICBzb3VyY2UgZGF0YWZyYW1lIG9yIGZpbGUgcGF0aAogICAgOnBhcmFtIHRhcmdldHM6ICAgICAgIG9wdGlvbmFsIGxpc3Qgb2YgZGF0YSB0YXJnZXQgb2JqZWN0cwogICAgOnBhcmFtIG5hbWVzcGFjZTogICAgIG5hbWVzcGFjZSBvciBtb2R1bGUgY29udGFpbmluZyBncmFwaCBjbGFzc2VzCiAgICA6cGFyYW0gaW5mZXJfb3B0aW9uczogc2NoZW1hIGFuZCBzdGF0cyBpbmZlciBvcHRpb25zCiAgICA6cGFyYW0gcnVuX2NvbmZpZzogICAgZnVuY3Rpb24gYW5kL29yIHJ1biBjb25maWd1cmF0aW9uIGZvciByZW1vdGUgam9icywKICAgICAgICAgICAgICAgICAgICAgICAgICBzZWUgOnB5OmNsYXNzOmB+bWxydW4uZmVhdHVyZV9zdG9yZS5SdW5Db25maWdgCiAgICA6cGFyYW0gc3BhcmtfY29udGV4dDogbG9jYWwgc3Bhcmsgc2Vzc2lvbiBmb3Igc3BhcmsgaW5nZXN0aW9uLCBleGFtcGxlIGZvciBjcmVhdGluZyB0aGUgc3BhcmsgY29udGV4dDoKICAgICAgICAgICAgICAgICAgICAgICAgICBgc3BhcmsgPSBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCJTcGFyayBmdW5jdGlvbiIpLmdldE9yQ3JlYXRlKClgCiAgICAgICAgICAgICAgICAgICAgICAgICAgRm9yIHJlbW90ZSBzcGFyayBpbmdlc3Rpb24sIHRoaXMgc2hvdWxkIGNvbnRhaW4gdGhlIHJlbW90ZSBzcGFyayBzZXJ2aWNlIG5hbWUKICAgIDpwYXJhbSBvdmVyd3JpdGU6ICAgICBkZWxldGUgdGhlIHRhcmdldHMnIGRhdGEgcHJpb3IgdG8gaW5nZXN0aW9uCiAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQ6IFRydWUgZm9yIG5vbi1zY2hlZHVsZWQgaW5nZXN0IC0gZGVsZXRlcyB0aGUgdGFyZ2V0cyB0aGF0IGFyZSBhYm91dCB0byBiZSBpbmdlc3RlZC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgRmFsc2UgZm9yIHNjaGVkdWxlZCBpbmdlc3QgLSBkb2VzIG5vdCBkZWxldGUgdGhlIHRhcmdldCkKCiAgICAiIiIKICAgICMgU2V0dGluZyBpbmZlcl9vcHRpb25zIHRvIGRlZmF1bHQ6CiAgICBjb250ZXh0Ll9wYXJhbWV0ZXJzWyJpbmZlcl9vcHRpb25zIl0gPSBpbmZlcl9vcHRpb25zIG9yIEluZmVyT3B0aW9ucy5kZWZhdWx0KCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiQ2FsbGluZyBpbmdlc3Rpb24gdGFzayB3aXRoOiB7ZmVhdHVyZXNldH0iKQoKICAgICMgaW5nZXN0IGNhbGxlZCB3aXRoIG1scnVuX2NvbnRleHQsIGZlYXR1cmVfc2V0LCBzb3VyY2UgYW5kIHRhcmdldHMgcGFzc2VkIHdpdGggY29udGV4dAogICAgIyBUaGlzIHBhcmFtcyBoZXJlIGZvciBkb2N1bWVudGF0aW9uIHB1cnBvc2VzIG9ubHkKICAgIGZzLmluZ2VzdCgKICAgICAgICBtbHJ1bl9jb250ZXh0PWNvbnRleHQsCiAgICAgICAgbmFtZXNwYWNlPW5
hbWVzcGFjZSwKICAgICAgICBzcGFya19jb250ZXh0PXNwYXJrX2NvbnRleHQsCiAgICApCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoImZlYXR1cmVzZXQiLCBmZWF0dXJlc2V0KQo=
-    commands: []
-    code_origin: https://github.com/mlrun/functions.git#886a88217c2a2570c81a14877f9c1dfb1ac8a244:C:\Users\yonatans\projects\functions\ingest\ingest.py
-    origin_filename: C:\Users\yonatans\projects\functions\ingest\ingest.py
-  entry_points:
-    ingest:
-      name: ingest
-      doc: "Read local DataFrame, file, URL, or source into the feature store\nIngest\
-        \ reads from the source, run the graph transformations, infers  metadata and\
-        \ stats\nand writes the results to the default of specified targets\n\nwhen\
-        \ targets are not specified data is stored in the configured default targets\n\
-        (will usually be NoSQL for real-time and Parquet for offline).\n\nexample::\n\
-        \n    stocks_set = FeatureSet(\"stocks\", entities=[Entity(\"ticker\")])\n\
-        \    stocks = pd.read_csv(\"stocks.csv\")\n    df = ingest(stocks_set, stocks,\
-        \ infer_options=fstore.InferOptions.default())\n\n    # for running as remote\
-        \ job\n    config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())\n \
-        \   df = ingest(stocks_set, stocks, run_config=config)\n\n    # specify source\
-        \ and targets\n    source = CSVSource(\"mycsv\", path=\"measurements.csv\"\
-        )\n    targets = [CSVTarget(\"mycsv\", path=\"./mycsv.csv\")]\n    ingest(measurements,\
-        \ source, targets)"
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: MLRun context
-        default: ''
-      - name: featureset
-        type: str
-        doc: feature set object or featureset.uri. (uri must be of a feature set that
-          is in the DB, call `.save()` if it's not)
-        default: ''
-      - name: source
-        type: str
-        doc: source dataframe or file path
-        default: ''
-      - name: targets
-        type: List[Union[str, Dict]]
-        doc: optional list of data target objects
-        default: null
-      - name: namespace
-        doc: namespace or module containing graph classes
-        default: null
-      - name: infer_options
-        doc: schema and stats infer options
-        default: null
-      - name: run_config
-        type: Union[str, Dict]
-        doc: function and/or run configuration for remote jobs, see :py:class:`~mlrun.feature_store.RunConfig`
-        default: null
-      - name: spark_context
-        doc: 'local spark session for spark ingestion, example for creating the spark
-          context: `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-          For remote spark ingestion, this should contain the remote spark service
-          name'
-        default: null
-      - name: overwrite
-        doc: 'delete the targets'' data prior to ingestion (default: True for non-scheduled
-          ingest - deletes the targets that are about to be ingested. False for scheduled
-          ingest - does not delete the target)'
-        default: null
-      outputs:
-      - default: ''
-      lineno: 8
-  description: Feature Store ingest function that runs the transformation graph on
-    the source of the featureset.
-  default_handler: ingest
-  disable_auto_mount: false
-  env: []
-  priority_class_name: ''
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/latest/static/ingest.html b/functions/development/ingest/latest/static/ingest.html deleted file mode 100644 index 32c6e3f1..00000000 --- a/functions/development/ingest/latest/static/ingest.html +++ /dev/null @@ -1,224 +0,0 @@ - - - - - - - -ingest.ingest - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for ingest.ingest

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun.feature_store as fs
-from mlrun.execution import MLClientCtx
-from mlrun.data_types import InferOptions
-
-
-
[docs]def ingest( - context: MLClientCtx, - featureset: str, - source: str, - targets: List[Union[str, Dict]] = None, - namespace=None, - infer_options=None, - run_config: Union[str, Dict] = None, - spark_context=None, - overwrite=None, -): - """Read local DataFrame, file, URL, or source into the feature store - Ingest reads from the source, run the graph transformations, infers metadata and stats - and writes the results to the default of specified targets - - when targets are not specified data is stored in the configured default targets - (will usually be NoSQL for real-time and Parquet for offline). - - example:: - - stocks_set = FeatureSet("stocks", entities=[Entity("ticker")]) - stocks = pd.read_csv("stocks.csv") - df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default()) - - # for running as remote job - config = RunConfig(image='mlrun/mlrun').apply(mount_v3io()) - df = ingest(stocks_set, stocks, run_config=config) - - # specify source and targets - source = CSVSource("mycsv", path="measurements.csv") - targets = [CSVTarget("mycsv", path="./mycsv.csv")] - ingest(measurements, source, targets) - - :param context: MLRun context - :param featureset: feature set object or featureset.uri. 
(uri must be of a feature set that is in the DB, - call `.save()` if it's not) - :param source: source dataframe or file path - :param targets: optional list of data target objects - :param namespace: namespace or module containing graph classes - :param infer_options: schema and stats infer options - :param run_config: function and/or run configuration for remote jobs, - see :py:class:`~mlrun.feature_store.RunConfig` - :param spark_context: local spark session for spark ingestion, example for creating the spark context: - `spark = SparkSession.builder.appName("Spark function").getOrCreate()` - For remote spark ingestion, this should contain the remote spark service name - :param overwrite: delete the targets' data prior to ingestion - (default: True for non-scheduled ingest - deletes the targets that are about to be ingested. - False for scheduled ingest - does not delete the target) - - """ - # Setting infer_options to default: - context._parameters["infer_options"] = infer_options or InferOptions.default() - - context.logger.info(f"Calling ingestion task with: {featureset}") - - # ingest called with mlrun_context, feature_set, source and targets passed with context - # This params here for documentation purposes only - fs.ingest( - mlrun_context=context, - namespace=namespace, - spark_context=spark_context, - ) - context.log_result("featureset", featureset)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/ingest/latest/static/item.html b/functions/development/ingest/latest/static/item.html deleted file mode 100644 index e36ddb73..00000000 --- a/functions/development/ingest/latest/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-- data-analysis
-- feature-store
-description: Feature Store ingest function that runs the transformation graph on the
-  source of the featureset.
-doc: ''
-example: ingest.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yonish
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: ingest
-platformVersion: 3.5.0
-spec:
-  filename: ingest.py
-  handler: ingest
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/ingest/latest/static/source.html b/functions/development/ingest/latest/static/source.html deleted file mode 100644 index ade154bf..00000000 --- a/functions/development/ingest/latest/static/source.html +++ /dev/null @@ -1,106 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from typing import Union, List, Dict
-
-import mlrun.feature_store as fs
-from mlrun.execution import MLClientCtx
-from mlrun.data_types import InferOptions
-
-
-def ingest(
-    context: MLClientCtx,
-    featureset: str,
-    source: str,
-    targets: List[Union[str, Dict]] = None,
-    namespace=None,
-    infer_options=None,
-    run_config: Union[str, Dict] = None,
-    spark_context=None,
-    overwrite=None,
-):
-    """Read local DataFrame, file, URL, or source into the feature store
-    Ingest reads from the source, run the graph transformations, infers  metadata and stats
-    and writes the results to the default of specified targets
-
-    when targets are not specified data is stored in the configured default targets
-    (will usually be NoSQL for real-time and Parquet for offline).
-
-    example::
-
-        stocks_set = FeatureSet("stocks", entities=[Entity("ticker")])
-        stocks = pd.read_csv("stocks.csv")
-        df = ingest(stocks_set, stocks, infer_options=fstore.InferOptions.default())
-
-        # for running as remote job
-        config = RunConfig(image='mlrun/mlrun').apply(mount_v3io())
-        df = ingest(stocks_set, stocks, run_config=config)
-
-        # specify source and targets
-        source = CSVSource("mycsv", path="measurements.csv")
-        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
-        ingest(measurements, source, targets)
-
-    :param context:       MLRun context
-    :param featureset:    feature set object or featureset.uri. (uri must be of a feature set that is in the DB,
-                          call `.save()` if it's not)
-    :param source:        source dataframe or file path
-    :param targets:       optional list of data target objects
-    :param namespace:     namespace or module containing graph classes
-    :param infer_options: schema and stats infer options
-    :param run_config:    function and/or run configuration for remote jobs,
-                          see :py:class:`~mlrun.feature_store.RunConfig`
-    :param spark_context: local spark session for spark ingestion, example for creating the spark context:
-                          `spark = SparkSession.builder.appName("Spark function").getOrCreate()`
-                          For remote spark ingestion, this should contain the remote spark service name
-    :param overwrite:     delete the targets' data prior to ingestion
-                          (default: True for non-scheduled ingest - deletes the targets that are about to be ingested.
-                                    False for scheduled ingest - does not delete the target)
-
-    """
-    # Setting infer_options to default:
-    context._parameters["infer_options"] = infer_options or InferOptions.default()
-
-    context.logger.info(f"Calling ingestion task with: {featureset}")
-
-    # ingest called with mlrun_context, feature_set, source and targets passed with context
-    # This params here for documentation purposes only
-    fs.ingest(
-        mlrun_context=context,
-        namespace=namespace,
-        spark_context=spark_context,
-    )
-    context.log_result("featureset", featureset)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/src/function.yaml b/functions/development/model_monitoring_stream/0.0.1/src/function.yaml deleted file mode 100644 index fbdccbbe..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: remote -metadata: - name: model-monitoring-stream - tag: '' - hash: 4dd87a1b2a2a92f3621f54b289a21126c2710e74 - project: default - categories: - - monitoring -spec: - command: '' - args: [] - image: mlrun/mlrun - description: '' - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: model-monitoring-stream - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/model_monitoring_stream/model_monitoring_stream.py - spec: - runtime: python:3.6 - handler: model_monitoring_stream:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode:  - source: '' - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/model_monitoring_stream/model_monitoring_stream.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/model_monitoring_stream/0.0.1/src/item.yaml b/functions/development/model_monitoring_stream/0.0.1/src/item.yaml deleted file mode 100644 index 832dafa3..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/src/item.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -categories: -- monitoring -description: '' -doc: '' -example: model_monitoring_stream.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: {} -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: model-monitoring-stream -platformVersion: '' -spec: - filename: model_monitoring_stream.py - handler: handler - image: mlrun/mlrun - kind: nuclio - 
requirements: [] -url: '' -version: 0.0.1 diff --git a/functions/development/model_monitoring_stream/0.0.1/src/model_monitoring_stream.ipynb b/functions/development/model_monitoring_stream/0.0.1/src/model_monitoring_stream.ipynb deleted file mode 100644 index 93d8c92e..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/src/model_monitoring_stream.ipynb +++ /dev/null @@ -1,178 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Model Monitoring\n", - "\n", - "## Initial set up (and pre-requisites)\n", - "1. Make sure you have the `mlrun-api` datasource available in your Grafana instance, otherwise add it by:\n", - " 1. Open your grafana instance\n", - " 2. Navigate to `Configuration -> Data Sources`\n", - " 3. Press `Add data source` and configure the following parameters\n", - " ```\n", - " Name: mlrun-api\n", - " URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints\n", - " Access: Server (default)\n", - "\n", - " ## Add a custom header of:\n", - " X-V3io-Session-Key: \n", - " ```\n", - " 4. Press `Save & Test` to make sure it works, a confirmation message should appear when this button is pressed\n", - "\n", - "2. Import the available dashboards `(./dashboards/*)` to you Grafana instance\n", - "3. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the\n", - " training step\n", - "\n", - " ```python\n", - " # Log model\n", - " context.log_model(\n", - " \"model\",\n", - " body=dumps(model),\n", - " artifact_path=context.artifact_subpath(\"models\"),\n", - " extra_data=eval_metrics,\n", - " model_file=\"model.pkl\",\n", - " metrics=context.results,\n", - " training_set=X_test, # <- make sure this is passed into log_model\n", - " labels={\"class\": \"sklearn.linear_model.LogisticRegression\"}\n", - " )\n", - " ```\n", - "4. 
When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying\n", - " `fn.set_tracking()`\n", - "\n", - "## Configuration\n", - "The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The\n", - "available configurations can be found under `stream.Config`. Once configured it should be supplied as environment\n", - "parameters to the Nuclio function by setting `fn.set_envs`\n", - "\n", - "```python\n", - "project: str # project name\n", - "sample_window: int # The sampling window for the data that flows into the TSDB and the KV\n", - "kv_path_template: str # Path template for the kv table\n", - "tsdb_path_template: str # Path template for the tsdb table\n", - "parquet_path_template: str # v3io parquets path template, assumes v3io is mounted\n", - "tsdb_batching_max_events: int # The max amount of event to batch before writing the batch to tsdb\n", - "tsdb_batching_timeout_secs: int # The max amount of seconds a given batch can be gathered before being emitted\n", - "parquet_batching_max_events: int # The max amount of event to batch before writing the batch to parquet\n", - "parquet_batching_timeout_secs: int # The max amount of seconds, a given batch can be gathered before being written to parquet\n", - "container: str # container name\n", - "v3io_access_key: str # V3IO Access key\n", - "v3io_framesd: str # V3IO framesd URL\n", - "time_format: str # The time format into which time related fields will be converted\n", - "aggregate_count_windows: List[str] # List of window sizes for predictions count\n", - "aggregate_count_period: str # Period of predictions count windows\n", - "aggregate_avg_windows: List[str] # List of window sizes for average latency\n", - "aggregate_avg_period: str # Period of average latency windows\n", - "```" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## 
Export function yaml" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.runtimes import RemoteRuntime\n", - "\n", - "\n", - "fn: RemoteRuntime = code_to_function(\n", - " name=\"model-monitoring-stream\",\n", - " kind=\"nuclio\",\n", - " image=\"mlrun/mlrun\",\n", - " filename=\"model_monitoring_stream.py\",\n", - " handler=\"handler\",\n", - ")\n", - "fn.export(\"model_monitoring_stream.yaml\")\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy Stream Processing" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from mlrun import import_function\n", - "from mlrun.platforms import mount_v3io\n", - "from mlrun.runtimes import RemoteRuntime\n", - "import json\n", - "\n", - "# Set project name\n", - "project = \"\"\n", - "\n", - "fn: RemoteRuntime = import_function(\"hub://model_monitoring_stream\")\n", - "\n", - "fn.add_v3io_stream_trigger(\n", - " stream_path=f\"projects/{project}/model-endpoints/stream\",\n", - " name=\"monitoring_stream_trigger\",\n", - ")\n", - "\n", - "fn.set_env(\"MODEL_MONITORING_PARAMETERS\", json.dumps({\"project\": project, \"v3io_framesd\": os.environ.get(\"V3IO_FRAMESD\")}))\n", - "\n", - "fn.metadata.project = project\n", - "fn.apply(mount_v3io())\n", - "fn.deploy()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/src/model_monitoring_stream.py b/functions/development/model_monitoring_stream/0.0.1/src/model_monitoring_stream.py deleted file mode 100644 index 8cdca8d6..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/src/model_monitoring_stream.py +++ /dev/null @@ -1,724 +0,0 @@ -import json -from collections import defaultdict -from datetime import datetime -from os import environ -from typing import Dict, List, Set, Optional, Any, Union - -import pandas as pd -from mlrun.config import config -from mlrun.run import MLClientCtx -from mlrun.utils import logger -from mlrun.utils.model_monitoring import ( - parse_model_endpoint_store_prefix, - create_model_endpoint_id, -) -from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client -from nuclio import Event -from storey import ( - FieldAggregator, - NoopDriver, - Table, - Map, - MapClass, - AggregateByKey, - build_flow, - Filter, - FlatMap, - TSDBTarget, - ParquetTarget, - SyncEmitSource, -) -from storey.dtypes import SlidingWindows -from storey.steps import SampleWindow - -# Constants -ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z" -FUNCTION_URI = "function_uri" -MODEL = "model" -VERSION = "version" -VERSIONED_MODEL = "versioned_model" -MODEL_CLASS = "model_class" -TIMESTAMP = "timestamp" -ENDPOINT_ID = "endpoint_id" -REQUEST_ID = "request_id" -LABELS = "labels" -UNPACKED_LABELS = "unpacked_labels" -LATENCY_AVG_5M = "latency_avg_5m" -LATENCY_AVG_1H = "latency_avg_1h" -PREDICTIONS_PER_SECOND = "predictions_per_second" -PREDICTIONS_COUNT_5M = "predictions_count_5m" -PREDICTIONS_COUNT_1H = "predictions_count_1h" -FIRST_REQUEST = "first_request" -LAST_REQUEST = "last_request" -ERROR_COUNT = "error_count" -ENTITIES = "entities" -FEATURE_NAMES = "feature_names" -LABEL_COLUMNS = 
"label_columns" -LATENCY = "latency" -RECORD_TYPE = "record_type" -FEATURES = "features" -PREDICTION = "prediction" -PREDICTIONS = "predictions" -NAMED_FEATURES = "named_features" -NAMED_PREDICTIONS = "named_predictions" -BASE_METRICS = "base_metrics" -CUSTOM_METRICS = "custom_metrics" -ENDPOINT_FEATURES = "endpoint_features" -METRICS = "metrics" -BATCH_TIMESTAMP = "batch_timestamp" -TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f" # ISO 8061 - - -# Stream processing code -class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.v3io_framesd = v3io_framesd or config.v3io_framesd - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = 
template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = template.format(project=project, kind="parquet") - - logger.info( - "Writer paths", - kv_path=self.kv_path, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent(self.kv_container, self.kv_path), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames(self.kv_container, self.kv_path), - # Branch 1: Aggregate events, count averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - 
rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - ), - ], - ] - ).run() - - def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - 
event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass - - @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event - - def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e - - @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = { - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed - - @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - 
set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event - - -class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - - def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid( - endpoint_id, - is_not_none, - timestamp, - ["when"], - ): - return None - - if endpoint_id not in self.first_request: - self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid( - endpoint_id, - is_not_none, - request_id, - ["request", "id"], - ): - return None - if not self.is_valid( - endpoint_id, - is_not_none, - latency, - ["microsec"], - ): - return None - if not self.is_valid( - endpoint_id, - is_not_none, - features, - ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, - is_not_none, - predictions, - ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in 
enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - return events - - def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id) - - def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False - - def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False - - -def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not 
is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else model - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, - versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event - - -def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs) - - -class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - - def do(self, event): - new_event = {} - for key in self.keys: - if key in event: - new_event[key] = event[key] - - return new_event if new_event else None - - -class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - - def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked - - -class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.feature_names = {} - self.label_columns = {} - - def do(self, event: 
Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - return event - - -class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - - def do(self, event: Dict): - get_v3io_client().kv.update( - 
container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event - - -class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - - def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event - - -def get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - endpoint_id=endpoint_id, - table_path=kv_path, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - .output.item - ) - return endpoint_record - except Exception: - return None - - -def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor) - - -def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.info(event_body) - context.stream_processor.consume(event_body) diff --git a/functions/development/model_monitoring_stream/0.0.1/src/requirements.txt b/functions/development/model_monitoring_stream/0.0.1/src/requirements.txt deleted file mode 100644 index 5e3645de..00000000 --- 
a/functions/development/model_monitoring_stream/0.0.1/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -storey \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/static/documentation.html b/functions/development/model_monitoring_stream/0.0.1/static/documentation.html deleted file mode 100644 index f4c0265e..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/static/documentation.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - -model_monitoring_stream package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

model_monitoring_stream package

-
-

Submodules

-
-
-

model_monitoring_stream.model_monitoring_stream module

-
-
-class model_monitoring_stream.model_monitoring_stream.EventStreamProcessor(project: str, sample_window: int = 10, tsdb_batching_max_events: int = 10, tsdb_batching_timeout_secs: int = 300, parquet_batching_max_events: int = 10000, parquet_batching_timeout_secs: int = 3600, aggregate_count_windows: Optional[List[str]] = None, aggregate_count_period: str = '30s', aggregate_avg_windows: Optional[List[str]] = None, aggregate_avg_period: str = '30s', v3io_access_key: Optional[str] = None, v3io_framesd: Optional[str] = None)[source]
-

Bases: object

-
-
-static compute_predictions_per_second(event: dict)[source]
-
-
-
-consume(event: Dict)[source]
-
-
-
-static process_before_events_tsdb(event: Dict)[source]
-
-
-
-process_before_kv(event: dict)[source]
-
-
-
-static process_before_parquet(event: dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterKeys(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterNotNone(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-
-class model_monitoring_stream.model_monitoring_stream.InferSchema(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.MapFeatureNames(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.ProcessEndpointEvent(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: dict)[source]
-
-
-
-handle_errors(endpoint_id, event)bool[source]
-
-
-
-is_valid(endpoint_id: str, validation_function, field: Any, dict_path: List[str])[source]
-
-
-
-resume_state(endpoint_id)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.UnpackValues(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.WriteToKV(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-model_monitoring_stream.model_monitoring_stream.enrich_even_details(event)Optional[dict][source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.get_endpoint_record(kv_container: str, kv_path: str, endpoint_id: str)Optional[dict][source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.handler(context: mlrun.execution.MLClientCtx, event: nuclio.request.Event)[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.init_context(context: mlrun.execution.MLClientCtx)[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_list_of_numerics(field: List[Union[int, float, dict, list]], dict_path: List[str])[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_not_none(field: Any, dict_path: List[str])[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/static/example.html b/functions/development/model_monitoring_stream/0.0.1/static/example.html deleted file mode 100644 index c7fffbc5..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/static/example.html +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - -Model Monitoring - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Model Monitoring

-
-

Initial set up (and pre-requisites)

-
    -
  1. Make sure you have the mlrun-api datasource available in your Grafana instance, otherwise add it by:

    -
      -
    1. Open your grafana instance

    2. -
    3. Navigate to Configuration -> Data Sources

    4. -
    5. Press Add data source and configure the following parameters

    6. -
    -
    Name: mlrun-api
    -URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints
    -Access: Server (default)
    -
    -## Add a custom header of:
    -X-V3io-Session-Key: <YOUR ACCESS KEY>
    -
    -
    -
      -
    1. Press Save & Test to make sure it works, a confirmation message should appear when this button is pressed

    2. -
    -
  2. -
  3. Import the available dashboards (./dashboards/*) to you Grafana instance

  4. -
  5. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the -training step

    -
    # Log model
    -context.log_model(
    -    "model",
    -    body=dumps(model),
    -    artifact_path=context.artifact_subpath("models"),
    -    extra_data=eval_metrics,
    -    model_file="model.pkl",
    -    metrics=context.results,
    -    training_set=X_test,  # <- make sure this is passed into log_model
    -    labels={"class": "sklearn.linear_model.LogisticRegression"}
    -)
    -
    -
    -
  6. -
  7. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying -fn.set_tracking()

  8. -
-
-
-

Configuration

-

The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The -available configurations can be found under stream.Config. Once configured it should be supplied as environment -parameters to the Nuclio function by setting fn.set_envs

-
project: str                        # project name
-sample_window: int                  # The sampling window for the data that flows into the TSDB and the KV
-kv_path_template: str               # Path template for the kv table
-tsdb_path_template: str             # Path template for the tsdb table
-parquet_path_template: str          # v3io parquets path template, assumes v3io is mounted
-tsdb_batching_max_events: int       # The max amount of event to batch before writing the batch to tsdb
-tsdb_batching_timeout_secs: int     # The max amount of seconds a given batch can be gathered before being emitted
-parquet_batching_max_events: int    # The max amount of event to batch before writing the batch to parquet
-parquet_batching_timeout_secs: int  # The max amount of seconds, a given batch can be gathered before being written to parquet
-container: str                      # container name
-v3io_access_key: str                # V3IO Access key
-v3io_framesd: str                   # V3IO framesd URL
-time_format: str                    # The time format into which time related fields will be converted
-aggregate_count_windows: List[str]  # List of window sizes for predictions count
-aggregate_count_period: str         # Period of predictions count windows
-aggregate_avg_windows: List[str]    # List of window sizes for average latency
-aggregate_avg_period: str           # Period of average latency windows
-
-
-
-
-

Export function yaml

-
-
-
from mlrun import code_to_function
-from mlrun.runtimes import RemoteRuntime
-
-
-fn: RemoteRuntime = code_to_function(
-    name="model-monitoring-stream",
-    kind="nuclio",
-    image="mlrun/mlrun",
-    filename="model_monitoring_stream.py",
-    handler="handler",
-)
-fn.export("model_monitoring_stream.yaml")
-
-
-
-
-
-
-

Deploy Stream Processing

-
-
-
import os
-
-from mlrun import import_function
-from mlrun.platforms import mount_v3io
-from mlrun.runtimes import RemoteRuntime
-import json
-
-# Set project name
-project = ""
-
-fn: RemoteRuntime = import_function("hub://model_monitoring_stream")
-
-fn.add_v3io_stream_trigger(
-    stream_path=f"projects/{project}/model-endpoints/stream",
-    name="monitoring_stream_trigger",
-)
-
-fn.set_env("MODEL_MONITORING_PARAMETERS", json.dumps({"project": project, "v3io_framesd": os.environ.get("V3IO_FRAMESD")}))
-
-fn.metadata.project = project
-fn.apply(mount_v3io())
-fn.deploy()
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/static/function.html b/functions/development/model_monitoring_stream/0.0.1/static/function.html deleted file mode 100644 index ea759c11..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: model-monitoring-stream
-  tag: ''
-  hash: 4dd87a1b2a2a92f3621f54b289a21126c2710e74
-  project: default
-  categories:
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: ''
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: model-monitoring-stream
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/model_monitoring_stream/model_monitoring_stream.py
-    spec:
-      runtime: python:3.6
-      handler: model_monitoring_stream:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: 
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/model_monitoring_stream/model_monitoring_stream.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/static/item.html b/functions/development/model_monitoring_stream/0.0.1/static/item.html deleted file mode 100644 index 31b34368..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/static/item.html +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- monitoring
-description: ''
-doc: ''
-example: model_monitoring_stream.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels: {}
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: model-monitoring-stream
-platformVersion: ''
-spec:
-  filename: model_monitoring_stream.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio
-  requirements: []
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.0.1/static/source.html b/functions/development/model_monitoring_stream/0.0.1/static/source.html deleted file mode 100644 index 1fe8132b..00000000 --- a/functions/development/model_monitoring_stream/0.0.1/static/source.html +++ /dev/null @@ -1,746 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-
-# Constants
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-class EventStreamProcessor:
-    def __init__(
-        self,
-        project: str,
-        sample_window: int = 10,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        parquet_batching_max_events: int = 10_000,
-        parquet_batching_timeout_secs: int = 60 * 60,  # Default 1 hour
-        aggregate_count_windows: Optional[List[str]] = None,
-        aggregate_count_period: str = "30s",
-        aggregate_avg_windows: Optional[List[str]] = None,
-        aggregate_avg_period: str = "30s",
-        v3io_access_key: Optional[str] = None,
-        v3io_framesd: Optional[str] = None,
-    ):
-        self.project = project
-        self.sample_window = sample_window
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-        self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
-        self.aggregate_count_period = aggregate_count_period
-        self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
-        self.aggregate_avg_period = aggregate_avg_period
-        self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
-        self.v3io_framesd = v3io_framesd or config.v3io_framesd
-
-        template = config.model_endpoint_monitoring.store_prefixes.default
-
-        kv_path = template.format(project=project, kind="endpoints")
-        _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)
-
-        tsdb_path = template.format(project=project, kind="events")
-        _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-
-        self.parquet_path = template.format(project=project, kind="parquet")
-
-        logger.info(
-            "Writer paths",
-            kv_path=self.kv_path,
-            tsdb_path=self.tsdb_path,
-            parquet_path=self.parquet_path,
-        )
-
-        self._kv_keys = [
-            FUNCTION_URI,
-            MODEL,
-            MODEL_CLASS,
-            TIMESTAMP,
-            ENDPOINT_ID,
-            LABELS,
-            UNPACKED_LABELS,
-            LATENCY_AVG_5M,
-            LATENCY_AVG_1H,
-            PREDICTIONS_PER_SECOND,
-            PREDICTIONS_COUNT_5M,
-            PREDICTIONS_COUNT_1H,
-            FIRST_REQUEST,
-            LAST_REQUEST,
-            ERROR_COUNT,
-        ]
-
-        self._flow = build_flow(
-            [
-                SyncEmitSource(),
-                ProcessEndpointEvent(self.kv_container, self.kv_path),
-                FilterNotNone(),
-                FlatMap(lambda x: x),
-                MapFeatureNames(self.kv_container, self.kv_path),
-                # Branch 1: Aggregate events, count averages and update TSDB and KV
-                [
-                    AggregateByKey(
-                        aggregates=[
-                            FieldAggregator(
-                                PREDICTIONS,
-                                ENDPOINT_ID,
-                                ["count"],
-                                SlidingWindows(
-                                    self.aggregate_count_windows,
-                                    self.aggregate_count_period,
-                                ),
-                            ),
-                            FieldAggregator(
-                                LATENCY,
-                                LATENCY,
-                                ["avg"],
-                                SlidingWindows(
-                                    self.aggregate_avg_windows,
-                                    self.aggregate_avg_period,
-                                ),
-                            ),
-                        ],
-                        table=Table("notable", NoopDriver()),
-                    ),
-                    SampleWindow(
-                        self.sample_window
-                    ),  # Add required gap between event to apply sampling
-                    Map(self.compute_predictions_per_second),
-                    # Branch 1.1: Updated KV
-                    [
-                        Map(self.process_before_kv),
-                        WriteToKV(container=self.kv_container, table=self.kv_path),
-                        InferSchema(
-                            v3io_access_key=self.v3io_access_key,
-                            v3io_framesd=self.v3io_framesd,
-                            container=self.kv_container,
-                            table=self.kv_path,
-                        ),
-                    ],
-                    # Branch 1.2: Update TSDB
-                    [
-                        # Map the event into taggable fields, add record type to each field
-                        Map(self.process_before_events_tsdb),
-                        [
-                            FilterKeys(BASE_METRICS),
-                            UnpackValues(BASE_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(ENDPOINT_FEATURES),
-                            UnpackValues(ENDPOINT_FEATURES),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(CUSTOM_METRICS),
-                            FilterNotNone(),
-                            UnpackValues(CUSTOM_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                    ],
-                ],
-                # Branch 2: Batch events, write to parquet
-                [
-                    Map(self.process_before_parquet),
-                    ParquetTarget(
-                        path=self.parquet_path,
-                        partition_cols=["$key", "$year", "$month", "$day", "$hour"],
-                        infer_columns_from_data=True,
-                        # Settings for _Batching
-                        max_events=self.parquet_batching_max_events,
-                        timeout_secs=self.parquet_batching_timeout_secs,
-                    ),
-                ],
-            ]
-        ).run()
-
-    def consume(self, event: Dict):
-        events = []
-        if "headers" in event and "values" in event:
-            for values in event["values"]:
-                events.append({k: v for k, v in zip(event["headers"], values)})
-        else:
-            events.append(event)
-
-        for enriched in map(enrich_even_details, events):
-            if enriched is not None:
-                self._flow.emit(
-                    enriched,
-                    key=enriched[ENDPOINT_ID],
-                    event_time=datetime.strptime(enriched["when"], ISO_8061_UTC),
-                )
-            else:
-                pass
-
-    @staticmethod
-    def compute_predictions_per_second(event: dict):
-        event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600
-        return event
-
-    def process_before_kv(self, event: dict):
-        # Filter relevant keys
-        e = {k: event[k] for k in self._kv_keys}
-        # Unpack labels dictionary
-        e = {**e, **e.pop(UNPACKED_LABELS, {})}
-        # Write labels to kv as json string to be presentable later
-        e[LABELS] = json.dumps(e[LABELS])
-        return e
-
-    @staticmethod
-    def process_before_events_tsdb(event: Dict):
-        base_fields = [TIMESTAMP, ENDPOINT_ID]
-
-        base_event = {k: event[k] for k in base_fields}
-        base_event[TIMESTAMP] = pd.to_datetime(
-            base_event[TIMESTAMP], format=TIME_FORMAT
-        )
-
-        base_metrics = {
-            RECORD_TYPE: BASE_METRICS,
-            PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND],
-            PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M],
-            PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H],
-            LATENCY_AVG_5M: event[LATENCY_AVG_5M],
-            LATENCY_AVG_1H: event[LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        endpoint_features = {
-            RECORD_TYPE: ENDPOINT_FEATURES,
-            **event[NAMED_PREDICTIONS],
-            **event[NAMED_FEATURES],
-            **base_event,
-        }
-
-        processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features}
-
-        if event[METRICS]:
-            processed[CUSTOM_METRICS] = {
-                RECORD_TYPE: CUSTOM_METRICS,
-                **event[METRICS],
-                **base_event,
-            }
-
-        return processed
-
-    @staticmethod
-    def process_before_parquet(event: dict):
-        def set_none_if_empty(_event: dict, keys: List[str]):
-            for key in keys:
-                if not _event.get(key):
-                    _event[key] = None
-
-        def drop_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                _event.pop(key, None)
-
-        def unpack_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                value = _event.get(key)
-                if value is not None:
-                    _event = {**value, **event}
-
-        drop_if_exists(event, [UNPACKED_LABELS, FEATURES])
-        unpack_if_exists(event, [ENTITIES])
-        set_none_if_empty(event, [LABELS, METRICS, ENTITIES])
-        return event
-
-
-class ProcessEndpointEvent(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container: str = kv_container
-        self.kv_path: str = kv_path
-        self.first_request: Dict[str, str] = dict()
-        self.last_request: Dict[str, str] = dict()
-        self.error_count: Dict[str, int] = defaultdict(int)
-        self.endpoints: Set[str] = set()
-
-    def do(self, event: dict):
-        function_uri = event[FUNCTION_URI]
-        versioned_model = event[VERSIONED_MODEL]
-        endpoint_id = event[ENDPOINT_ID]
-
-        # In case this process fails, resume state from existing record
-        self.resume_state(endpoint_id)
-
-        # Handle errors coming from stream
-        found_errors = self.handle_errors(endpoint_id, event)
-        if found_errors:
-            return None
-
-        # Validate event fields
-        model_class = event.get("model_class") or event.get("class")
-        timestamp = event.get("when")
-        request_id = event.get("request", {}).get("id")
-        latency = event.get("microsec")
-        features = event.get("request", {}).get("inputs")
-        predictions = event.get("resp", {}).get("outputs")
-
-        if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            timestamp,
-            ["when"],
-        ):
-            return None
-
-        if endpoint_id not in self.first_request:
-            self.first_request[endpoint_id] = timestamp
-        self.last_request[endpoint_id] = timestamp
-
-        if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            request_id,
-            ["request", "id"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            latency,
-            ["microsec"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            features,
-            ["request", "inputs"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id,
-            is_not_none,
-            predictions,
-            ["resp", "outputs"],
-        ):
-            return None
-
-        unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()}
-
-        # Separate each model invocation into sub events
-        events = []
-        for i, (feature, prediction) in enumerate(zip(features, predictions)):
-            if not self.is_valid(
-                endpoint_id,
-                is_list_of_numerics,
-                feature,
-                ["request", "inputs", f"[{i}]"],
-            ):
-                return None
-
-            if not isinstance(prediction, list):
-                prediction = [prediction]
-
-            events.append(
-                {
-                    FUNCTION_URI: function_uri,
-                    MODEL: versioned_model,
-                    MODEL_CLASS: model_class,
-                    TIMESTAMP: timestamp,
-                    ENDPOINT_ID: endpoint_id,
-                    REQUEST_ID: request_id,
-                    LATENCY: latency,
-                    FEATURES: feature,
-                    PREDICTION: prediction,
-                    FIRST_REQUEST: self.first_request[endpoint_id],
-                    LAST_REQUEST: self.last_request[endpoint_id],
-                    ERROR_COUNT: self.error_count[endpoint_id],
-                    LABELS: event.get(LABELS, {}),
-                    METRICS: event.get(METRICS, {}),
-                    ENTITIES: event.get("request", {}).get(ENTITIES, {}),
-                    UNPACKED_LABELS: unpacked_labels,
-                }
-            )
-        return events
-
-    def resume_state(self, endpoint_id):
-        # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
-        # left them
-        if endpoint_id not in self.endpoints:
-            logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-            )
-            if endpoint_record:
-                first_request = endpoint_record.get(FIRST_REQUEST)
-                if first_request:
-                    self.first_request[endpoint_id] = first_request
-                error_count = endpoint_record.get(ERROR_COUNT)
-                if error_count:
-                    self.error_count[endpoint_id] = error_count
-            self.endpoints.add(endpoint_id)
-
-    def is_valid(
-        self, endpoint_id: str, validation_function, field: Any, dict_path: List[str]
-    ):
-        if validation_function(field, dict_path):
-            return True
-        self.error_count[endpoint_id] += 1
-        return False
-
-    def handle_errors(self, endpoint_id, event) -> bool:
-        if "error" in event:
-            self.error_count[endpoint_id] += 1
-            return True
-
-        return False
-
-
-def enrich_even_details(event) -> Optional[dict]:
-    function_uri = event.get(FUNCTION_URI)
-
-    if not is_not_none(function_uri, [FUNCTION_URI]):
-        return None
-
-    model = event.get(MODEL)
-    if not is_not_none(model, [MODEL]):
-        return None
-
-    version = event.get(VERSION)
-    versioned_model = f"{model}:{version}" if version else model
-
-    endpoint_id = create_model_endpoint_id(
-        function_uri=function_uri,
-        versioned_model=versioned_model,
-    )
-
-    endpoint_id = str(endpoint_id)
-
-    event[VERSIONED_MODEL] = versioned_model
-    event[ENDPOINT_ID] = endpoint_id
-
-    return event
-
-
-def is_not_none(field: Any, dict_path: List[str]):
-    if field is not None:
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-def is_list_of_numerics(
-    field: List[Union[int, float, dict, list]], dict_path: List[str]
-):
-    if all(isinstance(x, int) or isinstance(x, float) for x in field):
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-class FilterNotNone(Filter):
-    def __init__(self, **kwargs):
-        super().__init__(fn=lambda event: event is not None, **kwargs)
-
-
-class FilterKeys(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys = list(args)
-
-    def do(self, event):
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        return new_event if new_event else None
-
-
-class UnpackValues(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys_to_unpack = set(args)
-
-    def do(self, event):
-        unpacked = {}
-        for key in event.keys():
-            if key in self.keys_to_unpack:
-                unpacked = {**unpacked, **event[key]}
-            else:
-                unpacked[key] = event[key]
-        return unpacked
-
-
-class MapFeatureNames(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container = kv_container
-        self.kv_path = kv_path
-        self.feature_names = {}
-        self.label_columns = {}
-
-    def do(self, event: Dict):
-        endpoint_id = event[ENDPOINT_ID]
-
-        if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-            )
-            feature_names = endpoint_record.get(FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
-
-            label_columns = endpoint_record.get(LABEL_COLUMNS)
-            label_columns = json.loads(label_columns) if label_columns else None
-
-            if not feature_names:
-                logger.warn(
-                    f"Feature names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    key=event[ENDPOINT_ID],
-                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
-                )
-
-            if not label_columns:
-                logger.warn(
-                    f"label column names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    key=event[ENDPOINT_ID],
-                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
-                )
-
-            self.label_columns[endpoint_id] = label_columns
-            self.feature_names[endpoint_id] = feature_names
-
-        feature_names = self.feature_names[endpoint_id]
-        features = event[FEATURES]
-        event[NAMED_FEATURES] = {
-            name: feature for name, feature in zip(feature_names, features)
-        }
-
-        label_columns = self.label_columns[endpoint_id]
-        prediction = event[PREDICTION]
-        event[NAMED_PREDICTIONS] = {
-            name: prediction for name, prediction in zip(label_columns, prediction)
-        }
-        return event
-
-
-class WriteToKV(MapClass):
-    def __init__(self, container: str, table: str, **kwargs):
-        super().__init__(**kwargs)
-        self.container = container
-        self.table = table
-
-    def do(self, event: Dict):
-        get_v3io_client().kv.update(
-            container=self.container,
-            table_path=self.table,
-            key=event[ENDPOINT_ID],
-            attributes=event,
-        )
-        return event
-
-
-class InferSchema(MapClass):
-    def __init__(
-        self,
-        v3io_access_key: str,
-        v3io_framesd: str,
-        container: str,
-        table: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.container = container
-        self.v3io_access_key = v3io_access_key
-        self.v3io_framesd = v3io_framesd
-        self.table = table
-        self.keys = set()
-
-    def do(self, event: Dict):
-        key_set = set(event.keys())
-        if not key_set.issubset(self.keys):
-            self.keys.update(key_set)
-            get_frames_client(
-                token=self.v3io_access_key,
-                container=self.container,
-                address=self.v3io_framesd,
-            ).execute(backend="kv", table=self.table, command="infer_schema")
-            logger.info(
-                "Found new keys, inferred schema", table=self.table, event=event
-            )
-        return event
-
-
-def get_endpoint_record(
-    kv_container: str, kv_path: str, endpoint_id: str
-) -> Optional[dict]:
-    logger.info(
-        f"Grabbing endpoint data",
-        endpoint_id=endpoint_id,
-        table_path=kv_path,
-    )
-    try:
-        endpoint_record = (
-            get_v3io_client()
-            .kv.get(
-                container=kv_container,
-                table_path=kv_path,
-                key=endpoint_id,
-            )
-            .output.item
-        )
-        return endpoint_record
-    except Exception:
-        return None
-
-
-def init_context(context: MLClientCtx):
-    context.logger.info("Initializing EventStreamProcessor")
-    parameters = environ.get("MODEL_MONITORING_PARAMETERS")
-    parameters = json.loads(parameters) if parameters else {}
-    stream_processor = EventStreamProcessor(**parameters)
-    setattr(context, "stream_processor", stream_processor)
-
-
-def handler(context: MLClientCtx, event: Event):
-    event_body = json.loads(event.body)
-    context.logger.info(event_body)
-    context.stream_processor.consume(event_body)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/src/function.yaml b/functions/development/model_monitoring_stream/0.8.0/src/function.yaml deleted file mode 100644 index cfadd282..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/src/function.yaml +++ /dev/null @@ -1,267 +0,0 @@ -kind: remote -metadata: - name: model-monitoring-stream - tag: '' - hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c - project: default - categories: - - monitoring -spec: - command: '' - args: [] - image: livsmichael/mlrun-api:automation - entry_points: - consume: - name: consume - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 293 - compute_predictions_per_second: - name: compute_predictions_per_second - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 311 - process_before_kv: - name: process_before_kv - doc: '' - parameters: - - name: self - default: '' - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 316 - process_before_events_tsdb: - name: process_before_events_tsdb - doc: '' - parameters: - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 325 - process_before_parquet: - name: process_before_parquet - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 362 - set_none_if_empty: - name: set_none_if_empty - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 364 - drop_if_exists: - name: drop_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 369 - unpack_if_exists: - name: unpack_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - 
type: List[str] - default: '' - outputs: - - default: '' - lineno: 373 - do: - name: do - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 702 - resume_state: - name: resume_state - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - outputs: - - default: '' - lineno: 475 - is_valid: - name: is_valid - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - type: str - default: '' - - name: validation_function - default: '' - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 495 - handle_errors: - name: handle_errors - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - - name: event - default: '' - outputs: - - default: '' - type: bool - lineno: 503 - enrich_even_details: - name: enrich_even_details - doc: '' - parameters: - - name: event - default: '' - outputs: - - default: '' - lineno: 511 - is_not_none: - name: is_not_none - doc: '' - parameters: - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 536 - is_list_of_numerics: - name: is_list_of_numerics - doc: '' - parameters: - - name: field - type: List[Union[int, float, dict, list]] - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 545 - get_endpoint_record: - name: get_endpoint_record - doc: '' - parameters: - - name: kv_container - type: str - default: '' - - name: kv_path - type: str - default: '' - - name: endpoint_id - type: str - default: '' - - name: access_key - type: str - default: '' - outputs: - - default: '' - lineno: 717 - init_context: - name: init_context - doc: '' - parameters: - - name: context - type: MLClientCtx - default: '' - outputs: - - default: '' - lineno: 743 - handler: - name: handler - doc: '' - 
parameters: - - name: context - type: MLClientCtx - default: '' - - name: event - type: Event - default: '' - outputs: - - default: '' - lineno: 751 - description: '' - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: model-monitoring-stream - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - spec: - runtime: python:3.6 - handler: model_monitoring_stream:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode:  - source: '' - build: - commands: [] - code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - default_handler: handler -verbose: false diff --git a/functions/development/model_monitoring_stream/0.8.0/src/item.yaml b/functions/development/model_monitoring_stream/0.8.0/src/item.yaml deleted file mode 100644 index 6c2c0a13..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/src/item.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -categories: -- monitoring -description: '' -doc: '' -example: model_monitoring_stream.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: {} -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: model-monitoring-stream -platformVersion: 3.2.0 -spec: - filename: model_monitoring_stream.py - handler: handler - image: livsmichael/mlrun-api:automation - kind: nuclio - requirements: [] -url: '' -version: 0.8.0 diff --git a/functions/development/model_monitoring_stream/0.8.0/src/model_monitoring_stream.ipynb b/functions/development/model_monitoring_stream/0.8.0/src/model_monitoring_stream.ipynb deleted file mode 100644 index 93d8c92e..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/src/model_monitoring_stream.ipynb +++ /dev/null 
@@ -1,178 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Model Monitoring\n", - "\n", - "## Initial set up (and pre-requisites)\n", - "1. Make sure you have the `mlrun-api` datasource available in your Grafana instance, otherwise add it by:\n", - " 1. Open your grafana instance\n", - " 2. Navigate to `Configuration -> Data Sources`\n", - " 3. Press `Add data source` and configure the following parameters\n", - " ```\n", - " Name: mlrun-api\n", - " URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints\n", - " Access: Server (default)\n", - "\n", - " ## Add a custom header of:\n", - " X-V3io-Session-Key: \n", - " ```\n", - " 4. Press `Save & Test` to make sure it works, a confirmation message should appear when this button is pressed\n", - "\n", - "2. Import the available dashboards `(./dashboards/*)` to you Grafana instance\n", - "3. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the\n", - " training step\n", - "\n", - " ```python\n", - " # Log model\n", - " context.log_model(\n", - " \"model\",\n", - " body=dumps(model),\n", - " artifact_path=context.artifact_subpath(\"models\"),\n", - " extra_data=eval_metrics,\n", - " model_file=\"model.pkl\",\n", - " metrics=context.results,\n", - " training_set=X_test, # <- make sure this is passed into log_model\n", - " labels={\"class\": \"sklearn.linear_model.LogisticRegression\"}\n", - " )\n", - " ```\n", - "4. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying\n", - " `fn.set_tracking()`\n", - "\n", - "## Configuration\n", - "The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The\n", - "available configurations can be found under `stream.Config`. 
Once configured it should be supplied as environment\n", - "parameters to the Nuclio function by setting `fn.set_envs`\n", - "\n", - "```python\n", - "project: str # project name\n", - "sample_window: int # The sampling window for the data that flows into the TSDB and the KV\n", - "kv_path_template: str # Path template for the kv table\n", - "tsdb_path_template: str # Path template for the tsdb table\n", - "parquet_path_template: str # v3io parquets path template, assumes v3io is mounted\n", - "tsdb_batching_max_events: int # The max amount of event to batch before writing the batch to tsdb\n", - "tsdb_batching_timeout_secs: int # The max amount of seconds a given batch can be gathered before being emitted\n", - "parquet_batching_max_events: int # The max amount of event to batch before writing the batch to parquet\n", - "parquet_batching_timeout_secs: int # The max amount of seconds, a given batch can be gathered before being written to parquet\n", - "container: str # container name\n", - "v3io_access_key: str # V3IO Access key\n", - "v3io_framesd: str # V3IO framesd URL\n", - "time_format: str # The time format into which time related fields will be converted\n", - "aggregate_count_windows: List[str] # List of window sizes for predictions count\n", - "aggregate_count_period: str # Period of predictions count windows\n", - "aggregate_avg_windows: List[str] # List of window sizes for average latency\n", - "aggregate_avg_period: str # Period of average latency windows\n", - "```" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Export function yaml" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.runtimes import RemoteRuntime\n", - "\n", - "\n", - "fn: RemoteRuntime = code_to_function(\n", - 
" name=\"model-monitoring-stream\",\n", - " kind=\"nuclio\",\n", - " image=\"mlrun/mlrun\",\n", - " filename=\"model_monitoring_stream.py\",\n", - " handler=\"handler\",\n", - ")\n", - "fn.export(\"model_monitoring_stream.yaml\")\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy Stream Processing" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from mlrun import import_function\n", - "from mlrun.platforms import mount_v3io\n", - "from mlrun.runtimes import RemoteRuntime\n", - "import json\n", - "\n", - "# Set project name\n", - "project = \"\"\n", - "\n", - "fn: RemoteRuntime = import_function(\"hub://model_monitoring_stream\")\n", - "\n", - "fn.add_v3io_stream_trigger(\n", - " stream_path=f\"projects/{project}/model-endpoints/stream\",\n", - " name=\"monitoring_stream_trigger\",\n", - ")\n", - "\n", - "fn.set_env(\"MODEL_MONITORING_PARAMETERS\", json.dumps({\"project\": project, \"v3io_framesd\": os.environ.get(\"V3IO_FRAMESD\")}))\n", - "\n", - "fn.metadata.project = project\n", - "fn.apply(mount_v3io())\n", - "fn.deploy()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/src/model_monitoring_stream.py b/functions/development/model_monitoring_stream/0.8.0/src/model_monitoring_stream.py deleted file mode 100644 
index ed21aeb5..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/src/model_monitoring_stream.py +++ /dev/null @@ -1,754 +0,0 @@ -import json -import os -from collections import defaultdict -from datetime import datetime -from os import environ -from typing import Dict, List, Set, Optional, Any, Union - -import pandas as pd -import v3io -from mlrun.config import config -from mlrun.run import MLClientCtx -from mlrun.utils import logger -from mlrun.utils.model_monitoring import ( - parse_model_endpoint_store_prefix, - create_model_endpoint_id, -) -from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client -from nuclio import Event -from storey import ( - FieldAggregator, - NoopDriver, - Table, - Map, - MapClass, - AggregateByKey, - build_flow, - Filter, - FlatMap, - TSDBTarget, - ParquetTarget, - SyncEmitSource, -) -from storey.dtypes import SlidingWindows -from storey.steps import SampleWindow -# Constants -from v3io.dataplane import RaiseForStatus - -ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z" -FUNCTION_URI = "function_uri" -MODEL = "model" -VERSION = "version" -VERSIONED_MODEL = "versioned_model" -MODEL_CLASS = "model_class" -TIMESTAMP = "timestamp" -ENDPOINT_ID = "endpoint_id" -REQUEST_ID = "request_id" -LABELS = "labels" -UNPACKED_LABELS = "unpacked_labels" -LATENCY_AVG_5M = "latency_avg_5m" -LATENCY_AVG_1H = "latency_avg_1h" -PREDICTIONS_PER_SECOND = "predictions_per_second" -PREDICTIONS_COUNT_5M = "predictions_count_5m" -PREDICTIONS_COUNT_1H = "predictions_count_1h" -FIRST_REQUEST = "first_request" -LAST_REQUEST = "last_request" -ERROR_COUNT = "error_count" -ENTITIES = "entities" -FEATURE_NAMES = "feature_names" -LABEL_COLUMNS = "label_columns" -LATENCY = "latency" -RECORD_TYPE = "record_type" -FEATURES = "features" -PREDICTION = "prediction" -PREDICTIONS = "predictions" -NAMED_FEATURES = "named_features" -NAMED_PREDICTIONS = "named_predictions" -BASE_METRICS = "base_metrics" -CUSTOM_METRICS = "custom_metrics" -ENDPOINT_FEATURES = 
"endpoint_features" -METRICS = "metrics" -BATCH_TIMESTAMP = "batch_timestamp" -TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f" # ISO 8061 - - -# Stream processing code -class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - 
tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - 
Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - 
max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - "v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - - def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass - - @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event - - def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e - - @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = 
{ - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed - - @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event - - -class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - - def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - 
self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - return events - - def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if 
first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id) - - def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False - - def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False - - -def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event - - -def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs) - - -class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - - def do(self, event): - new_event = {} - for key in self.keys: - if 
key in event: - new_event[key] = event[key] - - return new_event if new_event else None - - -class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - - def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked - - -class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - - def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - 
table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event - - -class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - - def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event - - -class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - - def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event - - -def 
get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None - - -def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor) - - -def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body) diff --git a/functions/development/model_monitoring_stream/0.8.0/src/requirements.txt b/functions/development/model_monitoring_stream/0.8.0/src/requirements.txt deleted file mode 100644 index 5e3645de..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -storey \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/static/documentation.html b/functions/development/model_monitoring_stream/0.8.0/static/documentation.html deleted file mode 100644 index 7971cb28..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/static/documentation.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - -model_monitoring_stream package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

model_monitoring_stream package

-
-

Submodules

-
-
-

model_monitoring_stream.model_monitoring_stream module

-
-
-class model_monitoring_stream.model_monitoring_stream.EventStreamProcessor(project: str, sample_window: int = 10, tsdb_batching_max_events: int = 10, tsdb_batching_timeout_secs: int = 300, parquet_batching_max_events: int = 10000, parquet_batching_timeout_secs: int = 3600, aggregate_count_windows: Optional[List[str]] = None, aggregate_count_period: str = '30s', aggregate_avg_windows: Optional[List[str]] = None, aggregate_avg_period: str = '30s', v3io_access_key: Optional[str] = None, v3io_framesd: Optional[str] = None, v3io_api: Optional[str] = None)[source]
-

Bases: object

-
-
-static compute_predictions_per_second(event: dict)[source]
-
-
-
-consume(event: Dict)[source]
-
-
-
-static process_before_events_tsdb(event: Dict)[source]
-
-
-
-process_before_kv(event: dict)[source]
-
-
-
-static process_before_parquet(event: dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterKeys(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterNotNone(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-
-class model_monitoring_stream.model_monitoring_stream.InferSchema(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.MapFeatureNames(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.ProcessEndpointEvent(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: dict)[source]
-
-
-
-handle_errors(endpoint_id, event)bool[source]
-
-
-
-is_valid(endpoint_id: str, validation_function, field: Any, dict_path: List[str])[source]
-
-
-
-resume_state(endpoint_id)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.UnpackValues(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.WriteToKV(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-model_monitoring_stream.model_monitoring_stream.enrich_even_details(event)Optional[dict][source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.get_endpoint_record(kv_container: str, kv_path: str, endpoint_id: str, access_key: str)Optional[dict][source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.handler(context: mlrun.execution.MLClientCtx, event: nuclio.request.Event)[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.init_context(context: mlrun.execution.MLClientCtx)[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_list_of_numerics(field: List[Union[int, float, dict, list]], dict_path: List[str])[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_not_none(field: Any, dict_path: List[str])[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/static/example.html b/functions/development/model_monitoring_stream/0.8.0/static/example.html deleted file mode 100644 index 7da34cb9..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/static/example.html +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - -Model Monitoring - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Model Monitoring

-
-

Initial set up (and pre-requisites)

-
    -
  1. Make sure you have the mlrun-api datasource available in your Grafana instance, otherwise add it by:

    -
      -
    1. Open your grafana instance

    2. -
    3. Navigate to Configuration -> Data Sources

    4. -
    5. Press Add data source and configure the following parameters

    6. -
    -
    Name: mlrun-api
    -URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints
    -Access: Server (default)
    -
    -## Add a custom header of:
    -X-V3io-Session-Key: <YOUR ACCESS KEY>
    -
    -
    -
      -
    1. Press Save & Test to make sure it works, a confirmation message should appear when this button is pressed

    2. -
    -
  2. -
  3. Import the available dashboards (./dashboards/*) to you Grafana instance

  4. -
  5. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the -training step

    -
    # Log model
    -context.log_model(
    -    "model",
    -    body=dumps(model),
    -    artifact_path=context.artifact_subpath("models"),
    -    extra_data=eval_metrics,
    -    model_file="model.pkl",
    -    metrics=context.results,
    -    training_set=X_test,  # <- make sure this is passed into log_model
    -    labels={"class": "sklearn.linear_model.LogisticRegression"}
    -)
    -
    -
    -
  6. -
  7. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying -fn.set_tracking()

  8. -
-
-
-

Configuration

-

The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The -available configurations can be found under stream.Config. Once configured it should be supplied as environment -parameters to the Nuclio function by setting fn.set_envs

-
project: str                        # project name
-sample_window: int                  # The sampling window for the data that flows into the TSDB and the KV
-kv_path_template: str               # Path template for the kv table
-tsdb_path_template: str             # Path template for the tsdb table
-parquet_path_template: str          # v3io parquets path template, assumes v3io is mounted
-tsdb_batching_max_events: int       # The max amount of event to batch before writing the batch to tsdb
-tsdb_batching_timeout_secs: int     # The max amount of seconds a given batch can be gathered before being emitted
-parquet_batching_max_events: int    # The max amount of event to batch before writing the batch to parquet
-parquet_batching_timeout_secs: int  # The max amount of seconds, a given batch can be gathered before being written to parquet
-container: str                      # container name
-v3io_access_key: str                # V3IO Access key
-v3io_framesd: str                   # V3IO framesd URL
-time_format: str                    # The time format into which time related fields will be converted
-aggregate_count_windows: List[str]  # List of window sizes for predictions count
-aggregate_count_period: str         # Period of predictions count windows
-aggregate_avg_windows: List[str]    # List of window sizes for average latency
-aggregate_avg_period: str           # Period of average latency windows
-
-
-
-
-

Export function yaml

-
-
-
from mlrun import code_to_function
-from mlrun.runtimes import RemoteRuntime
-
-
-fn: RemoteRuntime = code_to_function(
-    name="model-monitoring-stream",
-    kind="nuclio",
-    image="mlrun/mlrun",
-    filename="model_monitoring_stream.py",
-    handler="handler",
-)
-fn.export("model_monitoring_stream.yaml")
-
-
-
-
-
-
-

Deploy Stream Processing

-
-
-
import os
-
-from mlrun import import_function
-from mlrun.platforms import mount_v3io
-from mlrun.runtimes import RemoteRuntime
-import json
-
-# Set project name
-project = ""
-
-fn: RemoteRuntime = import_function("hub://model_monitoring_stream")
-
-fn.add_v3io_stream_trigger(
-    stream_path=f"projects/{project}/model-endpoints/stream",
-    name="monitoring_stream_trigger",
-)
-
-fn.set_env("MODEL_MONITORING_PARAMETERS", json.dumps({"project": project, "v3io_framesd": os.environ.get("V3IO_FRAMESD")}))
-
-fn.metadata.project = project
-fn.apply(mount_v3io())
-fn.deploy()
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/static/function.html b/functions/development/model_monitoring_stream/0.8.0/static/function.html deleted file mode 100644 index e5929222..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/static/function.html +++ /dev/null @@ -1,289 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: model-monitoring-stream
-  tag: ''
-  hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c
-  project: default
-  categories:
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: livsmichael/mlrun-api:automation
-  entry_points:
-    consume:
-      name: consume
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 293
-    compute_predictions_per_second:
-      name: compute_predictions_per_second
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 311
-    process_before_kv:
-      name: process_before_kv
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 316
-    process_before_events_tsdb:
-      name: process_before_events_tsdb
-      doc: ''
-      parameters:
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 325
-    process_before_parquet:
-      name: process_before_parquet
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 362
-    set_none_if_empty:
-      name: set_none_if_empty
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 364
-    drop_if_exists:
-      name: drop_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 369
-    unpack_if_exists:
-      name: unpack_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 373
-    do:
-      name: do
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 702
-    resume_state:
-      name: resume_state
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 475
-    is_valid:
-      name: is_valid
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: validation_function
-        default: ''
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 495
-    handle_errors:
-      name: handle_errors
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-        type: bool
-      lineno: 503
-    enrich_even_details:
-      name: enrich_even_details
-      doc: ''
-      parameters:
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 511
-    is_not_none:
-      name: is_not_none
-      doc: ''
-      parameters:
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 536
-    is_list_of_numerics:
-      name: is_list_of_numerics
-      doc: ''
-      parameters:
-      - name: field
-        type: List[Union[int, float, dict, list]]
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 545
-    get_endpoint_record:
-      name: get_endpoint_record
-      doc: ''
-      parameters:
-      - name: kv_container
-        type: str
-        default: ''
-      - name: kv_path
-        type: str
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: access_key
-        type: str
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 717
-    init_context:
-      name: init_context
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 743
-    handler:
-      name: handler
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: event
-        type: Event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 751
-  description: ''
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: model-monitoring-stream
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-    spec:
-      runtime: python:3.6
-      handler: model_monitoring_stream:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: 
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-  default_handler: handler
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/static/item.html b/functions/development/model_monitoring_stream/0.8.0/static/item.html deleted file mode 100644 index 47908171..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/static/item.html +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- monitoring
-description: ''
-doc: ''
-example: model_monitoring_stream.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels: {}
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: model-monitoring-stream
-platformVersion: 3.2.0
-spec:
-  filename: model_monitoring_stream.py
-  handler: handler
-  image: livsmichael/mlrun-api:automation
-  kind: nuclio
-  requirements: []
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.8.0/static/source.html b/functions/development/model_monitoring_stream/0.8.0/static/source.html deleted file mode 100644 index e76e5230..00000000 --- a/functions/development/model_monitoring_stream/0.8.0/static/source.html +++ /dev/null @@ -1,776 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-class EventStreamProcessor:
-    def __init__(
-        self,
-        project: str,
-        sample_window: int = 10,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        parquet_batching_max_events: int = 10_000,
-        parquet_batching_timeout_secs: int = 60 * 60,  # Default 1 hour
-        aggregate_count_windows: Optional[List[str]] = None,
-        aggregate_count_period: str = "30s",
-        aggregate_avg_windows: Optional[List[str]] = None,
-        aggregate_avg_period: str = "30s",
-        v3io_access_key: Optional[str] = None,
-        v3io_framesd: Optional[str] = None,
-        v3io_api: Optional[str] = None,
-    ):
-        self.project = project
-        self.sample_window = sample_window
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-        self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
-        self.aggregate_count_period = aggregate_count_period
-        self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
-        self.aggregate_avg_period = aggregate_avg_period
-
-        self.v3io_framesd = v3io_framesd or config.v3io_framesd
-        self.v3io_api = v3io_api or config.v3io_api
-
-        self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
-        )
-
-        template = config.model_endpoint_monitoring.store_prefixes.default
-
-        kv_path = template.format(project=project, kind="endpoints")
-        _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)
-
-        tsdb_path = template.format(project=project, kind="events")
-        _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-
-        self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format(
-            project=project, kind="parquet"
-        )
-
-        logger.info(
-            "V3IO Configuration",
-            v3io_access_key=self.v3io_access_key,
-            model_monitoring_access_key=self.model_monitoring_access_key,
-            default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default,
-            user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space,
-            v3io_api=self.v3io_api,
-            v3io_framesd=self.v3io_framesd,
-            kv_container=self.kv_container,
-            kv_path=self.kv_path,
-            tsdb_container=self.tsdb_container,
-            tsdb_path=self.tsdb_path,
-            parquet_path=self.parquet_path,
-        )
-
-        self._kv_keys = [
-            FUNCTION_URI,
-            MODEL,
-            MODEL_CLASS,
-            TIMESTAMP,
-            ENDPOINT_ID,
-            LABELS,
-            UNPACKED_LABELS,
-            LATENCY_AVG_5M,
-            LATENCY_AVG_1H,
-            PREDICTIONS_PER_SECOND,
-            PREDICTIONS_COUNT_5M,
-            PREDICTIONS_COUNT_1H,
-            FIRST_REQUEST,
-            LAST_REQUEST,
-            ERROR_COUNT,
-        ]
-
-        self._flow = build_flow(
-            [
-                SyncEmitSource(),
-                ProcessEndpointEvent(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    v3io_access_key=self.v3io_access_key,
-                ),
-                FilterNotNone(),
-                FlatMap(lambda x: x),
-                MapFeatureNames(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    access_key=self.v3io_access_key,
-                ),
-                # Branch 1: Aggregate events, count averages and update TSDB and KV
-                [
-                    AggregateByKey(
-                        aggregates=[
-                            FieldAggregator(
-                                PREDICTIONS,
-                                ENDPOINT_ID,
-                                ["count"],
-                                SlidingWindows(
-                                    self.aggregate_count_windows,
-                                    self.aggregate_count_period,
-                                ),
-                            ),
-                            FieldAggregator(
-                                LATENCY,
-                                LATENCY,
-                                ["avg"],
-                                SlidingWindows(
-                                    self.aggregate_avg_windows,
-                                    self.aggregate_avg_period,
-                                ),
-                            ),
-                        ],
-                        table=Table("notable", NoopDriver()),
-                    ),
-                    SampleWindow(
-                        self.sample_window
-                    ),  # Add required gap between event to apply sampling
-                    Map(self.compute_predictions_per_second),
-                    # Branch 1.1: Updated KV
-                    [
-                        Map(self.process_before_kv),
-                        WriteToKV(container=self.kv_container, table=self.kv_path),
-                        InferSchema(
-                            v3io_access_key=self.v3io_access_key,
-                            v3io_framesd=self.v3io_framesd,
-                            container=self.kv_container,
-                            table=self.kv_path,
-                        ),
-                    ],
-                    # Branch 1.2: Update TSDB
-                    [
-                        # Map the event into taggable fields, add record type to each field
-                        Map(self.process_before_events_tsdb),
-                        [
-                            FilterKeys(BASE_METRICS),
-                            UnpackValues(BASE_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(ENDPOINT_FEATURES),
-                            UnpackValues(ENDPOINT_FEATURES),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(CUSTOM_METRICS),
-                            FilterNotNone(),
-                            UnpackValues(CUSTOM_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                    ],
-                ],
-                # Branch 2: Batch events, write to parquet
-                [
-                    Map(self.process_before_parquet),
-                    ParquetTarget(
-                        path=self.parquet_path,
-                        partition_cols=["$key", "$year", "$month", "$day", "$hour"],
-                        infer_columns_from_data=True,
-                        # Settings for _Batching
-                        max_events=self.parquet_batching_max_events,
-                        timeout_secs=self.parquet_batching_timeout_secs,
-                        # Settings for v3io storage
-                        storage_options={
-                            "v3io_api": self.v3io_api,
-                            "v3io_access_key": self.model_monitoring_access_key,
-                        },
-                    ),
-                ],
-            ]
-        ).run()
-
-    def consume(self, event: Dict):
-        events = []
-        if "headers" in event and "values" in event:
-            for values in event["values"]:
-                events.append({k: v for k, v in zip(event["headers"], values)})
-        else:
-            events.append(event)
-
-        for enriched in map(enrich_even_details, events):
-            if enriched is not None:
-                self._flow.emit(
-                    enriched,
-                    key=enriched[ENDPOINT_ID],
-                    event_time=datetime.strptime(enriched["when"], ISO_8061_UTC),
-                )
-            else:
-                pass
-
-    @staticmethod
-    def compute_predictions_per_second(event: dict):
-        event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600
-        return event
-
-    def process_before_kv(self, event: dict):
-        # Filter relevant keys
-        e = {k: event[k] for k in self._kv_keys}
-        # Unpack labels dictionary
-        e = {**e, **e.pop(UNPACKED_LABELS, {})}
-        # Write labels to kv as json string to be presentable later
-        e[LABELS] = json.dumps(e[LABELS])
-        return e
-
-    @staticmethod
-    def process_before_events_tsdb(event: Dict):
-        base_fields = [TIMESTAMP, ENDPOINT_ID]
-
-        base_event = {k: event[k] for k in base_fields}
-        base_event[TIMESTAMP] = pd.to_datetime(
-            base_event[TIMESTAMP], format=TIME_FORMAT
-        )
-
-        base_metrics = {
-            RECORD_TYPE: BASE_METRICS,
-            PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND],
-            PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M],
-            PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H],
-            LATENCY_AVG_5M: event[LATENCY_AVG_5M],
-            LATENCY_AVG_1H: event[LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        endpoint_features = {
-            RECORD_TYPE: ENDPOINT_FEATURES,
-            **event[NAMED_PREDICTIONS],
-            **event[NAMED_FEATURES],
-            **base_event,
-        }
-
-        processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features}
-
-        if event[METRICS]:
-            processed[CUSTOM_METRICS] = {
-                RECORD_TYPE: CUSTOM_METRICS,
-                **event[METRICS],
-                **base_event,
-            }
-
-        return processed
-
-    @staticmethod
-    def process_before_parquet(event: dict):
-        def set_none_if_empty(_event: dict, keys: List[str]):
-            for key in keys:
-                if not _event.get(key):
-                    _event[key] = None
-
-        def drop_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                _event.pop(key, None)
-
-        def unpack_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                value = _event.get(key)
-                if value is not None:
-                    _event = {**value, **event}
-
-        drop_if_exists(event, [UNPACKED_LABELS, FEATURES])
-        unpack_if_exists(event, [ENTITIES])
-        set_none_if_empty(event, [LABELS, METRICS, ENTITIES])
-        return event
-
-
-class ProcessEndpointEvent(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container: str = kv_container
-        self.kv_path: str = kv_path
-        self.v3io_access_key: str = v3io_access_key
-        self.first_request: Dict[str, str] = dict()
-        self.last_request: Dict[str, str] = dict()
-        self.error_count: Dict[str, int] = defaultdict(int)
-        self.endpoints: Set[str] = set()
-
-    def do(self, event: dict):
-        function_uri = event[FUNCTION_URI]
-        versioned_model = event[VERSIONED_MODEL]
-        endpoint_id = event[ENDPOINT_ID]
-
-        # In case this process fails, resume state from existing record
-        self.resume_state(endpoint_id)
-
-        # Handle errors coming from stream
-        found_errors = self.handle_errors(endpoint_id, event)
-        if found_errors:
-            return None
-
-        # Validate event fields
-        model_class = event.get("model_class") or event.get("class")
-        timestamp = event.get("when")
-        request_id = event.get("request", {}).get("id")
-        latency = event.get("microsec")
-        features = event.get("request", {}).get("inputs")
-        predictions = event.get("resp", {}).get("outputs")
-
-        if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],):
-            return None
-
-        if endpoint_id not in self.first_request:
-            self.first_request[endpoint_id] = timestamp
-        self.last_request[endpoint_id] = timestamp
-
-        if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],):
-            return None
-        if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, features, ["request", "inputs"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, predictions, ["resp", "outputs"],
-        ):
-            return None
-
-        unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()}
-
-        # Separate each model invocation into sub events
-        events = []
-        for i, (feature, prediction) in enumerate(zip(features, predictions)):
-            if not self.is_valid(
-                endpoint_id,
-                is_list_of_numerics,
-                feature,
-                ["request", "inputs", f"[{i}]"],
-            ):
-                return None
-
-            if not isinstance(prediction, list):
-                prediction = [prediction]
-
-            events.append(
-                {
-                    FUNCTION_URI: function_uri,
-                    MODEL: versioned_model,
-                    MODEL_CLASS: model_class,
-                    TIMESTAMP: timestamp,
-                    ENDPOINT_ID: endpoint_id,
-                    REQUEST_ID: request_id,
-                    LATENCY: latency,
-                    FEATURES: feature,
-                    PREDICTION: prediction,
-                    FIRST_REQUEST: self.first_request[endpoint_id],
-                    LAST_REQUEST: self.last_request[endpoint_id],
-                    ERROR_COUNT: self.error_count[endpoint_id],
-                    LABELS: event.get(LABELS, {}),
-                    METRICS: event.get(METRICS, {}),
-                    ENTITIES: event.get("request", {}).get(ENTITIES, {}),
-                    UNPACKED_LABELS: unpacked_labels,
-                }
-            )
-        return events
-
-    def resume_state(self, endpoint_id):
-        # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
-        # left them
-        if endpoint_id not in self.endpoints:
-            logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.v3io_access_key,
-            )
-            if endpoint_record:
-                first_request = endpoint_record.get(FIRST_REQUEST)
-                if first_request:
-                    self.first_request[endpoint_id] = first_request
-                error_count = endpoint_record.get(ERROR_COUNT)
-                if error_count:
-                    self.error_count[endpoint_id] = error_count
-            self.endpoints.add(endpoint_id)
-
-    def is_valid(
-        self, endpoint_id: str, validation_function, field: Any, dict_path: List[str]
-    ):
-        if validation_function(field, dict_path):
-            return True
-        self.error_count[endpoint_id] += 1
-        return False
-
-    def handle_errors(self, endpoint_id, event) -> bool:
-        if "error" in event:
-            self.error_count[endpoint_id] += 1
-            return True
-
-        return False
-
-
-def enrich_even_details(event) -> Optional[dict]:
-    function_uri = event.get(FUNCTION_URI)
-
-    if not is_not_none(function_uri, [FUNCTION_URI]):
-        return None
-
-    model = event.get(MODEL)
-    if not is_not_none(model, [MODEL]):
-        return None
-
-    version = event.get(VERSION)
-    versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-    endpoint_id = create_model_endpoint_id(
-        function_uri=function_uri, versioned_model=versioned_model,
-    )
-
-    endpoint_id = str(endpoint_id)
-
-    event[VERSIONED_MODEL] = versioned_model
-    event[ENDPOINT_ID] = endpoint_id
-
-    return event
-
-
-def is_not_none(field: Any, dict_path: List[str]):
-    if field is not None:
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-def is_list_of_numerics(
-    field: List[Union[int, float, dict, list]], dict_path: List[str]
-):
-    if all(isinstance(x, int) or isinstance(x, float) for x in field):
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-class FilterNotNone(Filter):
-    def __init__(self, **kwargs):
-        super().__init__(fn=lambda event: event is not None, **kwargs)
-
-
-class FilterKeys(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys = list(args)
-
-    def do(self, event):
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        return new_event if new_event else None
-
-
-class UnpackValues(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys_to_unpack = set(args)
-
-    def do(self, event):
-        unpacked = {}
-        for key in event.keys():
-            if key in self.keys_to_unpack:
-                unpacked = {**unpacked, **event[key]}
-            else:
-                unpacked[key] = event[key]
-        return unpacked
-
-
-class MapFeatureNames(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container = kv_container
-        self.kv_path = kv_path
-        self.access_key = access_key
-        self.feature_names = {}
-        self.label_columns = {}
-
-    def do(self, event: Dict):
-        endpoint_id = event[ENDPOINT_ID]
-
-        if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.access_key,
-            )
-            feature_names = endpoint_record.get(FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
-
-            label_columns = endpoint_record.get(LABEL_COLUMNS)
-            label_columns = json.loads(label_columns) if label_columns else None
-
-            if not feature_names:
-                logger.warn(
-                    f"Feature names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            if not label_columns:
-                logger.warn(
-                    f"label column names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            self.label_columns[endpoint_id] = label_columns
-            self.feature_names[endpoint_id] = feature_names
-
-            logger.info(
-                "Label columns", endpoint_id=endpoint_id, label_columns=label_columns
-            )
-            logger.info(
-                "Feature names", endpoint_id=endpoint_id, feature_names=feature_names
-            )
-
-        feature_names = self.feature_names[endpoint_id]
-        features = event[FEATURES]
-        event[NAMED_FEATURES] = {
-            name: feature for name, feature in zip(feature_names, features)
-        }
-
-        label_columns = self.label_columns[endpoint_id]
-        prediction = event[PREDICTION]
-        event[NAMED_PREDICTIONS] = {
-            name: prediction for name, prediction in zip(label_columns, prediction)
-        }
-        logger.info("Mapped event", event=event)
-        return event
-
-
-class WriteToKV(MapClass):
-    def __init__(self, container: str, table: str, **kwargs):
-        super().__init__(**kwargs)
-        self.container = container
-        self.table = table
-
-    def do(self, event: Dict):
-        get_v3io_client().kv.update(
-            container=self.container,
-            table_path=self.table,
-            key=event[ENDPOINT_ID],
-            attributes=event,
-        )
-        return event
-
-
-class InferSchema(MapClass):
-    def __init__(
-        self,
-        v3io_access_key: str,
-        v3io_framesd: str,
-        container: str,
-        table: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.container = container
-        self.v3io_access_key = v3io_access_key
-        self.v3io_framesd = v3io_framesd
-        self.table = table
-        self.keys = set()
-
-    def do(self, event: Dict):
-        key_set = set(event.keys())
-        if not key_set.issubset(self.keys):
-            self.keys.update(key_set)
-            get_frames_client(
-                token=self.v3io_access_key,
-                container=self.container,
-                address=self.v3io_framesd,
-            ).execute(backend="kv", table=self.table, command="infer_schema")
-            logger.info(
-                "Found new keys, inferred schema", table=self.table, event=event
-            )
-        return event
-
-
-def get_endpoint_record(
-    kv_container: str, kv_path: str, endpoint_id: str, access_key: str
-) -> Optional[dict]:
-    logger.info(
-        f"Grabbing endpoint data",
-        container=kv_container,
-        table_path=kv_path,
-        key=endpoint_id,
-    )
-    try:
-        endpoint_record = (
-            get_v3io_client()
-            .kv.get(
-                container=kv_container,
-                table_path=kv_path,
-                key=endpoint_id,
-                access_key=access_key,
-                raise_for_status=v3io.dataplane.RaiseForStatus.always,
-            )
-            .output.item
-        )
-        return endpoint_record
-    except Exception:
-        return None
-
-
-def init_context(context: MLClientCtx):
-    context.logger.info("Initializing EventStreamProcessor")
-    parameters = environ.get("MODEL_MONITORING_PARAMETERS")
-    parameters = json.loads(parameters) if parameters else {}
-    stream_processor = EventStreamProcessor(**parameters)
-    setattr(context, "stream_processor", stream_processor)
-
-
-def handler(context: MLClientCtx, event: Event):
-    event_body = json.loads(event.body)
-    context.logger.debug(event_body)
-    context.stream_processor.consume(event_body)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/src/function.yaml b/functions/development/model_monitoring_stream/0.9.0/src/function.yaml deleted file mode 100644 index 07a21c40..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/src/function.yaml +++ /dev/null @@ -1,267 +0,0 @@ -kind: remote -metadata: - name: model-monitoring-stream - tag: '' - hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c - project: '' - categories: - - monitoring -spec: - command: '' - args: [] - image: livsmichael/mlrun-api:automation - entry_points: - consume: - name: consume - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 293 - compute_predictions_per_second: - name: compute_predictions_per_second - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 311 - process_before_kv: - name: process_before_kv - doc: '' - parameters: - - name: self - default: '' - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 316 - process_before_events_tsdb: - name: process_before_events_tsdb - doc: '' - parameters: - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 325 - process_before_parquet: - name: process_before_parquet - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 362 - set_none_if_empty: - name: set_none_if_empty - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 364 - drop_if_exists: - name: drop_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 369 - unpack_if_exists: - name: unpack_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: 
List[str] - default: '' - outputs: - - default: '' - lineno: 373 - do: - name: do - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 702 - resume_state: - name: resume_state - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - outputs: - - default: '' - lineno: 475 - is_valid: - name: is_valid - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - type: str - default: '' - - name: validation_function - default: '' - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 495 - handle_errors: - name: handle_errors - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - - name: event - default: '' - outputs: - - default: '' - type: bool - lineno: 503 - enrich_even_details: - name: enrich_even_details - doc: '' - parameters: - - name: event - default: '' - outputs: - - default: '' - lineno: 511 - is_not_none: - name: is_not_none - doc: '' - parameters: - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 536 - is_list_of_numerics: - name: is_list_of_numerics - doc: '' - parameters: - - name: field - type: List[Union[int, float, dict, list]] - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 545 - get_endpoint_record: - name: get_endpoint_record - doc: '' - parameters: - - name: kv_container - type: str - default: '' - - name: kv_path - type: str - default: '' - - name: endpoint_id - type: str - default: '' - - name: access_key - type: str - default: '' - outputs: - - default: '' - lineno: 717 - init_context: - name: init_context - doc: '' - parameters: - - name: context - type: MLClientCtx - default: '' - outputs: - - default: '' - lineno: 743 - handler: - name: handler - doc: '' - parameters: - 
- name: context - type: MLClientCtx - default: '' - - name: event - type: Event - default: '' - outputs: - - default: '' - lineno: 751 - description: '' - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: model-monitoring-stream - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - spec: - runtime: python:3.6 - handler: model_monitoring_stream:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode:  - source: '' - build: - commands: [] - code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - default_handler: handler -verbose: false diff --git a/functions/development/model_monitoring_stream/0.9.0/src/item.yaml b/functions/development/model_monitoring_stream/0.9.0/src/item.yaml deleted file mode 100644 index 13565971..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/src/item.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -categories: -- monitoring -description: '' -doc: '' -example: model_monitoring_stream.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: {} -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: model-monitoring-stream -platformVersion: 3.2.0 -spec: - filename: model_monitoring_stream.py - handler: handler - image: livsmichael/mlrun-api:automation - kind: nuclio - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/model_monitoring_stream/0.9.0/src/model_monitoring_stream.ipynb b/functions/development/model_monitoring_stream/0.9.0/src/model_monitoring_stream.ipynb deleted file mode 100644 index 93d8c92e..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/src/model_monitoring_stream.ipynb +++ /dev/null @@ -1,178 +0,0 
@@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Model Monitoring\n", - "\n", - "## Initial set up (and pre-requisites)\n", - "1. Make sure you have the `mlrun-api` datasource available in your Grafana instance, otherwise add it by:\n", - " 1. Open your grafana instance\n", - " 2. Navigate to `Configuration -> Data Sources`\n", - " 3. Press `Add data source` and configure the following parameters\n", - " ```\n", - " Name: mlrun-api\n", - " URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints\n", - " Access: Server (default)\n", - "\n", - " ## Add a custom header of:\n", - " X-V3io-Session-Key: \n", - " ```\n", - " 4. Press `Save & Test` to make sure it works, a confirmation message should appear when this button is pressed\n", - "\n", - "2. Import the available dashboards `(./dashboards/*)` to you Grafana instance\n", - "3. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the\n", - " training step\n", - "\n", - " ```python\n", - " # Log model\n", - " context.log_model(\n", - " \"model\",\n", - " body=dumps(model),\n", - " artifact_path=context.artifact_subpath(\"models\"),\n", - " extra_data=eval_metrics,\n", - " model_file=\"model.pkl\",\n", - " metrics=context.results,\n", - " training_set=X_test, # <- make sure this is passed into log_model\n", - " labels={\"class\": \"sklearn.linear_model.LogisticRegression\"}\n", - " )\n", - " ```\n", - "4. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying\n", - " `fn.set_tracking()`\n", - "\n", - "## Configuration\n", - "The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The\n", - "available configurations can be found under `stream.Config`. 
Once configured it should be supplied as environment\n", - "parameters to the Nuclio function by setting `fn.set_envs`\n", - "\n", - "```python\n", - "project: str # project name\n", - "sample_window: int # The sampling window for the data that flows into the TSDB and the KV\n", - "kv_path_template: str # Path template for the kv table\n", - "tsdb_path_template: str # Path template for the tsdb table\n", - "parquet_path_template: str # v3io parquets path template, assumes v3io is mounted\n", - "tsdb_batching_max_events: int # The max amount of event to batch before writing the batch to tsdb\n", - "tsdb_batching_timeout_secs: int # The max amount of seconds a given batch can be gathered before being emitted\n", - "parquet_batching_max_events: int # The max amount of event to batch before writing the batch to parquet\n", - "parquet_batching_timeout_secs: int # The max amount of seconds, a given batch can be gathered before being written to parquet\n", - "container: str # container name\n", - "v3io_access_key: str # V3IO Access key\n", - "v3io_framesd: str # V3IO framesd URL\n", - "time_format: str # The time format into which time related fields will be converted\n", - "aggregate_count_windows: List[str] # List of window sizes for predictions count\n", - "aggregate_count_period: str # Period of predictions count windows\n", - "aggregate_avg_windows: List[str] # List of window sizes for average latency\n", - "aggregate_avg_period: str # Period of average latency windows\n", - "```" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Export function yaml" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.runtimes import RemoteRuntime\n", - "\n", - "\n", - "fn: RemoteRuntime = code_to_function(\n", - 
" name=\"model-monitoring-stream\",\n", - " kind=\"nuclio\",\n", - " image=\"mlrun/mlrun\",\n", - " filename=\"model_monitoring_stream.py\",\n", - " handler=\"handler\",\n", - ")\n", - "fn.export(\"model_monitoring_stream.yaml\")\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy Stream Processing" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from mlrun import import_function\n", - "from mlrun.platforms import mount_v3io\n", - "from mlrun.runtimes import RemoteRuntime\n", - "import json\n", - "\n", - "# Set project name\n", - "project = \"\"\n", - "\n", - "fn: RemoteRuntime = import_function(\"hub://model_monitoring_stream\")\n", - "\n", - "fn.add_v3io_stream_trigger(\n", - " stream_path=f\"projects/{project}/model-endpoints/stream\",\n", - " name=\"monitoring_stream_trigger\",\n", - ")\n", - "\n", - "fn.set_env(\"MODEL_MONITORING_PARAMETERS\", json.dumps({\"project\": project, \"v3io_framesd\": os.environ.get(\"V3IO_FRAMESD\")}))\n", - "\n", - "fn.metadata.project = project\n", - "fn.apply(mount_v3io())\n", - "fn.deploy()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/src/model_monitoring_stream.py b/functions/development/model_monitoring_stream/0.9.0/src/model_monitoring_stream.py deleted file mode 100644 
index ed21aeb5..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/src/model_monitoring_stream.py +++ /dev/null @@ -1,754 +0,0 @@ -import json -import os -from collections import defaultdict -from datetime import datetime -from os import environ -from typing import Dict, List, Set, Optional, Any, Union - -import pandas as pd -import v3io -from mlrun.config import config -from mlrun.run import MLClientCtx -from mlrun.utils import logger -from mlrun.utils.model_monitoring import ( - parse_model_endpoint_store_prefix, - create_model_endpoint_id, -) -from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client -from nuclio import Event -from storey import ( - FieldAggregator, - NoopDriver, - Table, - Map, - MapClass, - AggregateByKey, - build_flow, - Filter, - FlatMap, - TSDBTarget, - ParquetTarget, - SyncEmitSource, -) -from storey.dtypes import SlidingWindows -from storey.steps import SampleWindow -# Constants -from v3io.dataplane import RaiseForStatus - -ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z" -FUNCTION_URI = "function_uri" -MODEL = "model" -VERSION = "version" -VERSIONED_MODEL = "versioned_model" -MODEL_CLASS = "model_class" -TIMESTAMP = "timestamp" -ENDPOINT_ID = "endpoint_id" -REQUEST_ID = "request_id" -LABELS = "labels" -UNPACKED_LABELS = "unpacked_labels" -LATENCY_AVG_5M = "latency_avg_5m" -LATENCY_AVG_1H = "latency_avg_1h" -PREDICTIONS_PER_SECOND = "predictions_per_second" -PREDICTIONS_COUNT_5M = "predictions_count_5m" -PREDICTIONS_COUNT_1H = "predictions_count_1h" -FIRST_REQUEST = "first_request" -LAST_REQUEST = "last_request" -ERROR_COUNT = "error_count" -ENTITIES = "entities" -FEATURE_NAMES = "feature_names" -LABEL_COLUMNS = "label_columns" -LATENCY = "latency" -RECORD_TYPE = "record_type" -FEATURES = "features" -PREDICTION = "prediction" -PREDICTIONS = "predictions" -NAMED_FEATURES = "named_features" -NAMED_PREDICTIONS = "named_predictions" -BASE_METRICS = "base_metrics" -CUSTOM_METRICS = "custom_metrics" -ENDPOINT_FEATURES = 
"endpoint_features" -METRICS = "metrics" -BATCH_TIMESTAMP = "batch_timestamp" -TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f" # ISO 8061 - - -# Stream processing code -class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - 
tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - 
Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - 
max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - "v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - - def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass - - @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event - - def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e - - @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = 
{ - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed - - @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event - - -class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - - def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - 
self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - return events - - def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if 
first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id) - - def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False - - def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False - - -def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event - - -def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs) - - -class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - - def do(self, event): - new_event = {} - for key in self.keys: - if 
key in event: - new_event[key] = event[key] - - return new_event if new_event else None - - -class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - - def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked - - -class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - - def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - 
table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event - - -class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - - def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event - - -class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - - def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event - - -def 
get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None - - -def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor) - - -def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body) diff --git a/functions/development/model_monitoring_stream/0.9.0/src/requirements.txt b/functions/development/model_monitoring_stream/0.9.0/src/requirements.txt deleted file mode 100644 index 5e3645de..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -storey \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/static/documentation.html b/functions/development/model_monitoring_stream/0.9.0/static/documentation.html deleted file mode 100644 index 0e7674fe..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -model_monitoring_stream package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

model_monitoring_stream package

-
-

Submodules

-
-
-

model_monitoring_stream.model_monitoring_stream module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/static/example.html b/functions/development/model_monitoring_stream/0.9.0/static/example.html deleted file mode 100644 index 7da34cb9..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/static/example.html +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - -Model Monitoring - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Model Monitoring

-
-

Initial set up (and pre-requisites)

-
    -
  1. Make sure you have the mlrun-api datasource available in your Grafana instance, otherwise add it by:

    -
      -
    1. Open your grafana instance

    2. -
    3. Navigate to Configuration -> Data Sources

    4. -
    5. Press Add data source and configure the following parameters

    6. -
    -
    Name: mlrun-api
    -URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints
    -Access: Server (default)
    -
    -## Add a custom header of:
    -X-V3io-Session-Key: <YOUR ACCESS KEY>
    -
    -
    -
      -
    1. Press Save & Test to make sure it works, a confirmation message should appear when this button is pressed

    2. -
    -
  2. -
  3. Import the available dashboards (./dashboards/*) to you Grafana instance

  4. -
  5. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the -training step

    -
    # Log model
    -context.log_model(
    -    "model",
    -    body=dumps(model),
    -    artifact_path=context.artifact_subpath("models"),
    -    extra_data=eval_metrics,
    -    model_file="model.pkl",
    -    metrics=context.results,
    -    training_set=X_test,  # <- make sure this is passed into log_model
    -    labels={"class": "sklearn.linear_model.LogisticRegression"}
    -)
    -
    -
    -
  6. -
  7. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying -fn.set_tracking()

  8. -
-
-
-

Configuration

-

The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The -available configurations can be found under stream.Config. Once configured it should be supplied as environment -parameters to the Nuclio function by setting fn.set_envs

-
project: str                        # project name
-sample_window: int                  # The sampling window for the data that flows into the TSDB and the KV
-kv_path_template: str               # Path template for the kv table
-tsdb_path_template: str             # Path template for the tsdb table
-parquet_path_template: str          # v3io parquets path template, assumes v3io is mounted
-tsdb_batching_max_events: int       # The max amount of event to batch before writing the batch to tsdb
-tsdb_batching_timeout_secs: int     # The max amount of seconds a given batch can be gathered before being emitted
-parquet_batching_max_events: int    # The max amount of event to batch before writing the batch to parquet
-parquet_batching_timeout_secs: int  # The max amount of seconds, a given batch can be gathered before being written to parquet
-container: str                      # container name
-v3io_access_key: str                # V3IO Access key
-v3io_framesd: str                   # V3IO framesd URL
-time_format: str                    # The time format into which time related fields will be converted
-aggregate_count_windows: List[str]  # List of window sizes for predictions count
-aggregate_count_period: str         # Period of predictions count windows
-aggregate_avg_windows: List[str]    # List of window sizes for average latency
-aggregate_avg_period: str           # Period of average latency windows
-
-
-
-
-

Export function yaml

-
-
-
from mlrun import code_to_function
-from mlrun.runtimes import RemoteRuntime
-
-
-fn: RemoteRuntime = code_to_function(
-    name="model-monitoring-stream",
-    kind="nuclio",
-    image="mlrun/mlrun",
-    filename="model_monitoring_stream.py",
-    handler="handler",
-)
-fn.export("model_monitoring_stream.yaml")
-
-
-
-
-
-
-

Deploy Stream Processing

-
-
-
import os
-
-from mlrun import import_function
-from mlrun.platforms import mount_v3io
-from mlrun.runtimes import RemoteRuntime
-import json
-
-# Set project name
-project = ""
-
-fn: RemoteRuntime = import_function("hub://model_monitoring_stream")
-
-fn.add_v3io_stream_trigger(
-    stream_path=f"projects/{project}/model-endpoints/stream",
-    name="monitoring_stream_trigger",
-)
-
-fn.set_env("MODEL_MONITORING_PARAMETERS", json.dumps({"project": project, "v3io_framesd": os.environ.get("V3IO_FRAMESD")}))
-
-fn.metadata.project = project
-fn.apply(mount_v3io())
-fn.deploy()
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/static/function.html b/functions/development/model_monitoring_stream/0.9.0/static/function.html deleted file mode 100644 index 81290204..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/static/function.html +++ /dev/null @@ -1,289 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: model-monitoring-stream
-  tag: ''
-  hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c
-  project: ''
-  categories:
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: livsmichael/mlrun-api:automation
-  entry_points:
-    consume:
-      name: consume
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 293
-    compute_predictions_per_second:
-      name: compute_predictions_per_second
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 311
-    process_before_kv:
-      name: process_before_kv
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 316
-    process_before_events_tsdb:
-      name: process_before_events_tsdb
-      doc: ''
-      parameters:
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 325
-    process_before_parquet:
-      name: process_before_parquet
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 362
-    set_none_if_empty:
-      name: set_none_if_empty
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 364
-    drop_if_exists:
-      name: drop_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 369
-    unpack_if_exists:
-      name: unpack_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 373
-    do:
-      name: do
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 702
-    resume_state:
-      name: resume_state
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 475
-    is_valid:
-      name: is_valid
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: validation_function
-        default: ''
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 495
-    handle_errors:
-      name: handle_errors
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-        type: bool
-      lineno: 503
-    enrich_even_details:
-      name: enrich_even_details
-      doc: ''
-      parameters:
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 511
-    is_not_none:
-      name: is_not_none
-      doc: ''
-      parameters:
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 536
-    is_list_of_numerics:
-      name: is_list_of_numerics
-      doc: ''
-      parameters:
-      - name: field
-        type: List[Union[int, float, dict, list]]
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 545
-    get_endpoint_record:
-      name: get_endpoint_record
-      doc: ''
-      parameters:
-      - name: kv_container
-        type: str
-        default: ''
-      - name: kv_path
-        type: str
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: access_key
-        type: str
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 717
-    init_context:
-      name: init_context
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 743
-    handler:
-      name: handler
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: event
-        type: Event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 751
-  description: ''
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: model-monitoring-stream
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-    spec:
-      runtime: python:3.6
-      handler: model_monitoring_stream:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: 
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-  default_handler: handler
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/static/item.html b/functions/development/model_monitoring_stream/0.9.0/static/item.html deleted file mode 100644 index 010a791c..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/static/item.html +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- monitoring
-description: ''
-doc: ''
-example: model_monitoring_stream.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels: {}
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: model-monitoring-stream
-platformVersion: 3.2.0
-spec:
-  filename: model_monitoring_stream.py
-  handler: handler
-  image: livsmichael/mlrun-api:automation
-  kind: nuclio
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.0/static/source.html b/functions/development/model_monitoring_stream/0.9.0/static/source.html deleted file mode 100644 index e76e5230..00000000 --- a/functions/development/model_monitoring_stream/0.9.0/static/source.html +++ /dev/null @@ -1,776 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-class EventStreamProcessor:
-    def __init__(
-        self,
-        project: str,
-        sample_window: int = 10,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        parquet_batching_max_events: int = 10_000,
-        parquet_batching_timeout_secs: int = 60 * 60,  # Default 1 hour
-        aggregate_count_windows: Optional[List[str]] = None,
-        aggregate_count_period: str = "30s",
-        aggregate_avg_windows: Optional[List[str]] = None,
-        aggregate_avg_period: str = "30s",
-        v3io_access_key: Optional[str] = None,
-        v3io_framesd: Optional[str] = None,
-        v3io_api: Optional[str] = None,
-    ):
-        self.project = project
-        self.sample_window = sample_window
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-        self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
-        self.aggregate_count_period = aggregate_count_period
-        self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
-        self.aggregate_avg_period = aggregate_avg_period
-
-        self.v3io_framesd = v3io_framesd or config.v3io_framesd
-        self.v3io_api = v3io_api or config.v3io_api
-
-        self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
-        )
-
-        template = config.model_endpoint_monitoring.store_prefixes.default
-
-        kv_path = template.format(project=project, kind="endpoints")
-        _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)
-
-        tsdb_path = template.format(project=project, kind="events")
-        _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-
-        self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format(
-            project=project, kind="parquet"
-        )
-
-        logger.info(
-            "V3IO Configuration",
-            v3io_access_key=self.v3io_access_key,
-            model_monitoring_access_key=self.model_monitoring_access_key,
-            default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default,
-            user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space,
-            v3io_api=self.v3io_api,
-            v3io_framesd=self.v3io_framesd,
-            kv_container=self.kv_container,
-            kv_path=self.kv_path,
-            tsdb_container=self.tsdb_container,
-            tsdb_path=self.tsdb_path,
-            parquet_path=self.parquet_path,
-        )
-
-        self._kv_keys = [
-            FUNCTION_URI,
-            MODEL,
-            MODEL_CLASS,
-            TIMESTAMP,
-            ENDPOINT_ID,
-            LABELS,
-            UNPACKED_LABELS,
-            LATENCY_AVG_5M,
-            LATENCY_AVG_1H,
-            PREDICTIONS_PER_SECOND,
-            PREDICTIONS_COUNT_5M,
-            PREDICTIONS_COUNT_1H,
-            FIRST_REQUEST,
-            LAST_REQUEST,
-            ERROR_COUNT,
-        ]
-
-        self._flow = build_flow(
-            [
-                SyncEmitSource(),
-                ProcessEndpointEvent(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    v3io_access_key=self.v3io_access_key,
-                ),
-                FilterNotNone(),
-                FlatMap(lambda x: x),
-                MapFeatureNames(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    access_key=self.v3io_access_key,
-                ),
-                # Branch 1: Aggregate events, count averages and update TSDB and KV
-                [
-                    AggregateByKey(
-                        aggregates=[
-                            FieldAggregator(
-                                PREDICTIONS,
-                                ENDPOINT_ID,
-                                ["count"],
-                                SlidingWindows(
-                                    self.aggregate_count_windows,
-                                    self.aggregate_count_period,
-                                ),
-                            ),
-                            FieldAggregator(
-                                LATENCY,
-                                LATENCY,
-                                ["avg"],
-                                SlidingWindows(
-                                    self.aggregate_avg_windows,
-                                    self.aggregate_avg_period,
-                                ),
-                            ),
-                        ],
-                        table=Table("notable", NoopDriver()),
-                    ),
-                    SampleWindow(
-                        self.sample_window
-                    ),  # Add required gap between event to apply sampling
-                    Map(self.compute_predictions_per_second),
-                    # Branch 1.1: Updated KV
-                    [
-                        Map(self.process_before_kv),
-                        WriteToKV(container=self.kv_container, table=self.kv_path),
-                        InferSchema(
-                            v3io_access_key=self.v3io_access_key,
-                            v3io_framesd=self.v3io_framesd,
-                            container=self.kv_container,
-                            table=self.kv_path,
-                        ),
-                    ],
-                    # Branch 1.2: Update TSDB
-                    [
-                        # Map the event into taggable fields, add record type to each field
-                        Map(self.process_before_events_tsdb),
-                        [
-                            FilterKeys(BASE_METRICS),
-                            UnpackValues(BASE_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(ENDPOINT_FEATURES),
-                            UnpackValues(ENDPOINT_FEATURES),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(CUSTOM_METRICS),
-                            FilterNotNone(),
-                            UnpackValues(CUSTOM_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                    ],
-                ],
-                # Branch 2: Batch events, write to parquet
-                [
-                    Map(self.process_before_parquet),
-                    ParquetTarget(
-                        path=self.parquet_path,
-                        partition_cols=["$key", "$year", "$month", "$day", "$hour"],
-                        infer_columns_from_data=True,
-                        # Settings for _Batching
-                        max_events=self.parquet_batching_max_events,
-                        timeout_secs=self.parquet_batching_timeout_secs,
-                        # Settings for v3io storage
-                        storage_options={
-                            "v3io_api": self.v3io_api,
-                            "v3io_access_key": self.model_monitoring_access_key,
-                        },
-                    ),
-                ],
-            ]
-        ).run()
-
-    def consume(self, event: Dict):
-        events = []
-        if "headers" in event and "values" in event:
-            for values in event["values"]:
-                events.append({k: v for k, v in zip(event["headers"], values)})
-        else:
-            events.append(event)
-
-        for enriched in map(enrich_even_details, events):
-            if enriched is not None:
-                self._flow.emit(
-                    enriched,
-                    key=enriched[ENDPOINT_ID],
-                    event_time=datetime.strptime(enriched["when"], ISO_8061_UTC),
-                )
-            else:
-                pass
-
-    @staticmethod
-    def compute_predictions_per_second(event: dict):
-        event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600
-        return event
-
-    def process_before_kv(self, event: dict):
-        # Filter relevant keys
-        e = {k: event[k] for k in self._kv_keys}
-        # Unpack labels dictionary
-        e = {**e, **e.pop(UNPACKED_LABELS, {})}
-        # Write labels to kv as json string to be presentable later
-        e[LABELS] = json.dumps(e[LABELS])
-        return e
-
-    @staticmethod
-    def process_before_events_tsdb(event: Dict):
-        base_fields = [TIMESTAMP, ENDPOINT_ID]
-
-        base_event = {k: event[k] for k in base_fields}
-        base_event[TIMESTAMP] = pd.to_datetime(
-            base_event[TIMESTAMP], format=TIME_FORMAT
-        )
-
-        base_metrics = {
-            RECORD_TYPE: BASE_METRICS,
-            PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND],
-            PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M],
-            PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H],
-            LATENCY_AVG_5M: event[LATENCY_AVG_5M],
-            LATENCY_AVG_1H: event[LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        endpoint_features = {
-            RECORD_TYPE: ENDPOINT_FEATURES,
-            **event[NAMED_PREDICTIONS],
-            **event[NAMED_FEATURES],
-            **base_event,
-        }
-
-        processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features}
-
-        if event[METRICS]:
-            processed[CUSTOM_METRICS] = {
-                RECORD_TYPE: CUSTOM_METRICS,
-                **event[METRICS],
-                **base_event,
-            }
-
-        return processed
-
-    @staticmethod
-    def process_before_parquet(event: dict):
-        def set_none_if_empty(_event: dict, keys: List[str]):
-            for key in keys:
-                if not _event.get(key):
-                    _event[key] = None
-
-        def drop_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                _event.pop(key, None)
-
-        def unpack_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                value = _event.get(key)
-                if value is not None:
-                    _event = {**value, **event}
-
-        drop_if_exists(event, [UNPACKED_LABELS, FEATURES])
-        unpack_if_exists(event, [ENTITIES])
-        set_none_if_empty(event, [LABELS, METRICS, ENTITIES])
-        return event
-
-
-class ProcessEndpointEvent(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container: str = kv_container
-        self.kv_path: str = kv_path
-        self.v3io_access_key: str = v3io_access_key
-        self.first_request: Dict[str, str] = dict()
-        self.last_request: Dict[str, str] = dict()
-        self.error_count: Dict[str, int] = defaultdict(int)
-        self.endpoints: Set[str] = set()
-
-    def do(self, event: dict):
-        function_uri = event[FUNCTION_URI]
-        versioned_model = event[VERSIONED_MODEL]
-        endpoint_id = event[ENDPOINT_ID]
-
-        # In case this process fails, resume state from existing record
-        self.resume_state(endpoint_id)
-
-        # Handle errors coming from stream
-        found_errors = self.handle_errors(endpoint_id, event)
-        if found_errors:
-            return None
-
-        # Validate event fields
-        model_class = event.get("model_class") or event.get("class")
-        timestamp = event.get("when")
-        request_id = event.get("request", {}).get("id")
-        latency = event.get("microsec")
-        features = event.get("request", {}).get("inputs")
-        predictions = event.get("resp", {}).get("outputs")
-
-        if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],):
-            return None
-
-        if endpoint_id not in self.first_request:
-            self.first_request[endpoint_id] = timestamp
-        self.last_request[endpoint_id] = timestamp
-
-        if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],):
-            return None
-        if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, features, ["request", "inputs"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, predictions, ["resp", "outputs"],
-        ):
-            return None
-
-        unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()}
-
-        # Separate each model invocation into sub events
-        events = []
-        for i, (feature, prediction) in enumerate(zip(features, predictions)):
-            if not self.is_valid(
-                endpoint_id,
-                is_list_of_numerics,
-                feature,
-                ["request", "inputs", f"[{i}]"],
-            ):
-                return None
-
-            if not isinstance(prediction, list):
-                prediction = [prediction]
-
-            events.append(
-                {
-                    FUNCTION_URI: function_uri,
-                    MODEL: versioned_model,
-                    MODEL_CLASS: model_class,
-                    TIMESTAMP: timestamp,
-                    ENDPOINT_ID: endpoint_id,
-                    REQUEST_ID: request_id,
-                    LATENCY: latency,
-                    FEATURES: feature,
-                    PREDICTION: prediction,
-                    FIRST_REQUEST: self.first_request[endpoint_id],
-                    LAST_REQUEST: self.last_request[endpoint_id],
-                    ERROR_COUNT: self.error_count[endpoint_id],
-                    LABELS: event.get(LABELS, {}),
-                    METRICS: event.get(METRICS, {}),
-                    ENTITIES: event.get("request", {}).get(ENTITIES, {}),
-                    UNPACKED_LABELS: unpacked_labels,
-                }
-            )
-        return events
-
-    def resume_state(self, endpoint_id):
-        # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
-        # left them
-        if endpoint_id not in self.endpoints:
-            logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.v3io_access_key,
-            )
-            if endpoint_record:
-                first_request = endpoint_record.get(FIRST_REQUEST)
-                if first_request:
-                    self.first_request[endpoint_id] = first_request
-                error_count = endpoint_record.get(ERROR_COUNT)
-                if error_count:
-                    self.error_count[endpoint_id] = error_count
-            self.endpoints.add(endpoint_id)
-
-    def is_valid(
-        self, endpoint_id: str, validation_function, field: Any, dict_path: List[str]
-    ):
-        if validation_function(field, dict_path):
-            return True
-        self.error_count[endpoint_id] += 1
-        return False
-
-    def handle_errors(self, endpoint_id, event) -> bool:
-        if "error" in event:
-            self.error_count[endpoint_id] += 1
-            return True
-
-        return False
-
-
-def enrich_even_details(event) -> Optional[dict]:
-    function_uri = event.get(FUNCTION_URI)
-
-    if not is_not_none(function_uri, [FUNCTION_URI]):
-        return None
-
-    model = event.get(MODEL)
-    if not is_not_none(model, [MODEL]):
-        return None
-
-    version = event.get(VERSION)
-    versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-    endpoint_id = create_model_endpoint_id(
-        function_uri=function_uri, versioned_model=versioned_model,
-    )
-
-    endpoint_id = str(endpoint_id)
-
-    event[VERSIONED_MODEL] = versioned_model
-    event[ENDPOINT_ID] = endpoint_id
-
-    return event
-
-
-def is_not_none(field: Any, dict_path: List[str]):
-    if field is not None:
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-def is_list_of_numerics(
-    field: List[Union[int, float, dict, list]], dict_path: List[str]
-):
-    if all(isinstance(x, int) or isinstance(x, float) for x in field):
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-class FilterNotNone(Filter):
-    def __init__(self, **kwargs):
-        super().__init__(fn=lambda event: event is not None, **kwargs)
-
-
-class FilterKeys(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys = list(args)
-
-    def do(self, event):
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        return new_event if new_event else None
-
-
-class UnpackValues(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys_to_unpack = set(args)
-
-    def do(self, event):
-        unpacked = {}
-        for key in event.keys():
-            if key in self.keys_to_unpack:
-                unpacked = {**unpacked, **event[key]}
-            else:
-                unpacked[key] = event[key]
-        return unpacked
-
-
-class MapFeatureNames(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container = kv_container
-        self.kv_path = kv_path
-        self.access_key = access_key
-        self.feature_names = {}
-        self.label_columns = {}
-
-    def do(self, event: Dict):
-        endpoint_id = event[ENDPOINT_ID]
-
-        if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.access_key,
-            )
-            feature_names = endpoint_record.get(FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
-
-            label_columns = endpoint_record.get(LABEL_COLUMNS)
-            label_columns = json.loads(label_columns) if label_columns else None
-
-            if not feature_names:
-                logger.warn(
-                    f"Feature names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            if not label_columns:
-                logger.warn(
-                    f"label column names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            self.label_columns[endpoint_id] = label_columns
-            self.feature_names[endpoint_id] = feature_names
-
-            logger.info(
-                "Label columns", endpoint_id=endpoint_id, label_columns=label_columns
-            )
-            logger.info(
-                "Feature names", endpoint_id=endpoint_id, feature_names=feature_names
-            )
-
-        feature_names = self.feature_names[endpoint_id]
-        features = event[FEATURES]
-        event[NAMED_FEATURES] = {
-            name: feature for name, feature in zip(feature_names, features)
-        }
-
-        label_columns = self.label_columns[endpoint_id]
-        prediction = event[PREDICTION]
-        event[NAMED_PREDICTIONS] = {
-            name: prediction for name, prediction in zip(label_columns, prediction)
-        }
-        logger.info("Mapped event", event=event)
-        return event
-
-
-class WriteToKV(MapClass):
-    def __init__(self, container: str, table: str, **kwargs):
-        super().__init__(**kwargs)
-        self.container = container
-        self.table = table
-
-    def do(self, event: Dict):
-        get_v3io_client().kv.update(
-            container=self.container,
-            table_path=self.table,
-            key=event[ENDPOINT_ID],
-            attributes=event,
-        )
-        return event
-
-
-class InferSchema(MapClass):
-    def __init__(
-        self,
-        v3io_access_key: str,
-        v3io_framesd: str,
-        container: str,
-        table: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.container = container
-        self.v3io_access_key = v3io_access_key
-        self.v3io_framesd = v3io_framesd
-        self.table = table
-        self.keys = set()
-
-    def do(self, event: Dict):
-        key_set = set(event.keys())
-        if not key_set.issubset(self.keys):
-            self.keys.update(key_set)
-            get_frames_client(
-                token=self.v3io_access_key,
-                container=self.container,
-                address=self.v3io_framesd,
-            ).execute(backend="kv", table=self.table, command="infer_schema")
-            logger.info(
-                "Found new keys, inferred schema", table=self.table, event=event
-            )
-        return event
-
-
-def get_endpoint_record(
-    kv_container: str, kv_path: str, endpoint_id: str, access_key: str
-) -> Optional[dict]:
-    logger.info(
-        f"Grabbing endpoint data",
-        container=kv_container,
-        table_path=kv_path,
-        key=endpoint_id,
-    )
-    try:
-        endpoint_record = (
-            get_v3io_client()
-            .kv.get(
-                container=kv_container,
-                table_path=kv_path,
-                key=endpoint_id,
-                access_key=access_key,
-                raise_for_status=v3io.dataplane.RaiseForStatus.always,
-            )
-            .output.item
-        )
-        return endpoint_record
-    except Exception:
-        return None
-
-
-def init_context(context: MLClientCtx):
-    context.logger.info("Initializing EventStreamProcessor")
-    parameters = environ.get("MODEL_MONITORING_PARAMETERS")
-    parameters = json.loads(parameters) if parameters else {}
-    stream_processor = EventStreamProcessor(**parameters)
-    setattr(context, "stream_processor", stream_processor)
-
-
-def handler(context: MLClientCtx, event: Event):
-    event_body = json.loads(event.body)
-    context.logger.debug(event_body)
-    context.stream_processor.consume(event_body)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/src/function.yaml b/functions/development/model_monitoring_stream/0.9.1/src/function.yaml deleted file mode 100644 index 07a21c40..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/src/function.yaml +++ /dev/null @@ -1,267 +0,0 @@ -kind: remote -metadata: - name: model-monitoring-stream - tag: '' - hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c - project: '' - categories: - - monitoring -spec: - command: '' - args: [] - image: livsmichael/mlrun-api:automation - entry_points: - consume: - name: consume - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 293 - compute_predictions_per_second: - name: compute_predictions_per_second - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 311 - process_before_kv: - name: process_before_kv - doc: '' - parameters: - - name: self - default: '' - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 316 - process_before_events_tsdb: - name: process_before_events_tsdb - doc: '' - parameters: - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 325 - process_before_parquet: - name: process_before_parquet - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 362 - set_none_if_empty: - name: set_none_if_empty - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 364 - drop_if_exists: - name: drop_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 369 - unpack_if_exists: - name: unpack_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: 
List[str] - default: '' - outputs: - - default: '' - lineno: 373 - do: - name: do - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 702 - resume_state: - name: resume_state - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - outputs: - - default: '' - lineno: 475 - is_valid: - name: is_valid - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - type: str - default: '' - - name: validation_function - default: '' - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 495 - handle_errors: - name: handle_errors - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - - name: event - default: '' - outputs: - - default: '' - type: bool - lineno: 503 - enrich_even_details: - name: enrich_even_details - doc: '' - parameters: - - name: event - default: '' - outputs: - - default: '' - lineno: 511 - is_not_none: - name: is_not_none - doc: '' - parameters: - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 536 - is_list_of_numerics: - name: is_list_of_numerics - doc: '' - parameters: - - name: field - type: List[Union[int, float, dict, list]] - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 545 - get_endpoint_record: - name: get_endpoint_record - doc: '' - parameters: - - name: kv_container - type: str - default: '' - - name: kv_path - type: str - default: '' - - name: endpoint_id - type: str - default: '' - - name: access_key - type: str - default: '' - outputs: - - default: '' - lineno: 717 - init_context: - name: init_context - doc: '' - parameters: - - name: context - type: MLClientCtx - default: '' - outputs: - - default: '' - lineno: 743 - handler: - name: handler - doc: '' - parameters: - 
- name: context - type: MLClientCtx - default: '' - - name: event - type: Event - default: '' - outputs: - - default: '' - lineno: 751 - description: '' - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: model-monitoring-stream - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - spec: - runtime: python:3.6 - handler: model_monitoring_stream:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode:  - source: '' - build: - commands: [] - code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - default_handler: handler -verbose: false diff --git a/functions/development/model_monitoring_stream/0.9.1/src/item.yaml b/functions/development/model_monitoring_stream/0.9.1/src/item.yaml deleted file mode 100644 index e34ecb54..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/src/item.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -categories: -- monitoring -description: '' -doc: '' -example: model_monitoring_stream.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: {} -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: model-monitoring-stream -platformVersion: 3.2.0 -spec: - filename: model_monitoring_stream.py - handler: handler - image: livsmichael/mlrun-api:automation - kind: nuclio - requirements: [] -url: '' -version: 0.9.1 diff --git a/functions/development/model_monitoring_stream/0.9.1/src/model_monitoring_stream.ipynb b/functions/development/model_monitoring_stream/0.9.1/src/model_monitoring_stream.ipynb deleted file mode 100644 index 93d8c92e..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/src/model_monitoring_stream.ipynb +++ /dev/null @@ -1,178 +0,0 
@@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Model Monitoring\n", - "\n", - "## Initial set up (and pre-requisites)\n", - "1. Make sure you have the `mlrun-api` datasource available in your Grafana instance, otherwise add it by:\n", - " 1. Open your grafana instance\n", - " 2. Navigate to `Configuration -> Data Sources`\n", - " 3. Press `Add data source` and configure the following parameters\n", - " ```\n", - " Name: mlrun-api\n", - " URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints\n", - " Access: Server (default)\n", - "\n", - " ## Add a custom header of:\n", - " X-V3io-Session-Key: \n", - " ```\n", - " 4. Press `Save & Test` to make sure it works, a confirmation message should appear when this button is pressed\n", - "\n", - "2. Import the available dashboards `(./dashboards/*)` to you Grafana instance\n", - "3. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the\n", - " training step\n", - "\n", - " ```python\n", - " # Log model\n", - " context.log_model(\n", - " \"model\",\n", - " body=dumps(model),\n", - " artifact_path=context.artifact_subpath(\"models\"),\n", - " extra_data=eval_metrics,\n", - " model_file=\"model.pkl\",\n", - " metrics=context.results,\n", - " training_set=X_test, # <- make sure this is passed into log_model\n", - " labels={\"class\": \"sklearn.linear_model.LogisticRegression\"}\n", - " )\n", - " ```\n", - "4. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying\n", - " `fn.set_tracking()`\n", - "\n", - "## Configuration\n", - "The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The\n", - "available configurations can be found under `stream.Config`. 
Once configured it should be supplied as environment\n", - "parameters to the Nuclio function by setting `fn.set_envs`\n", - "\n", - "```python\n", - "project: str # project name\n", - "sample_window: int # The sampling window for the data that flows into the TSDB and the KV\n", - "kv_path_template: str # Path template for the kv table\n", - "tsdb_path_template: str # Path template for the tsdb table\n", - "parquet_path_template: str # v3io parquets path template, assumes v3io is mounted\n", - "tsdb_batching_max_events: int # The max amount of event to batch before writing the batch to tsdb\n", - "tsdb_batching_timeout_secs: int # The max amount of seconds a given batch can be gathered before being emitted\n", - "parquet_batching_max_events: int # The max amount of event to batch before writing the batch to parquet\n", - "parquet_batching_timeout_secs: int # The max amount of seconds, a given batch can be gathered before being written to parquet\n", - "container: str # container name\n", - "v3io_access_key: str # V3IO Access key\n", - "v3io_framesd: str # V3IO framesd URL\n", - "time_format: str # The time format into which time related fields will be converted\n", - "aggregate_count_windows: List[str] # List of window sizes for predictions count\n", - "aggregate_count_period: str # Period of predictions count windows\n", - "aggregate_avg_windows: List[str] # List of window sizes for average latency\n", - "aggregate_avg_period: str # Period of average latency windows\n", - "```" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Export function yaml" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.runtimes import RemoteRuntime\n", - "\n", - "\n", - "fn: RemoteRuntime = code_to_function(\n", - 
" name=\"model-monitoring-stream\",\n", - " kind=\"nuclio\",\n", - " image=\"mlrun/mlrun\",\n", - " filename=\"model_monitoring_stream.py\",\n", - " handler=\"handler\",\n", - ")\n", - "fn.export(\"model_monitoring_stream.yaml\")\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy Stream Processing" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from mlrun import import_function\n", - "from mlrun.platforms import mount_v3io\n", - "from mlrun.runtimes import RemoteRuntime\n", - "import json\n", - "\n", - "# Set project name\n", - "project = \"\"\n", - "\n", - "fn: RemoteRuntime = import_function(\"hub://model_monitoring_stream\")\n", - "\n", - "fn.add_v3io_stream_trigger(\n", - " stream_path=f\"projects/{project}/model-endpoints/stream\",\n", - " name=\"monitoring_stream_trigger\",\n", - ")\n", - "\n", - "fn.set_env(\"MODEL_MONITORING_PARAMETERS\", json.dumps({\"project\": project, \"v3io_framesd\": os.environ.get(\"V3IO_FRAMESD\")}))\n", - "\n", - "fn.metadata.project = project\n", - "fn.apply(mount_v3io())\n", - "fn.deploy()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/src/model_monitoring_stream.py b/functions/development/model_monitoring_stream/0.9.1/src/model_monitoring_stream.py deleted file mode 100644 
index ed21aeb5..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/src/model_monitoring_stream.py +++ /dev/null @@ -1,754 +0,0 @@ -import json -import os -from collections import defaultdict -from datetime import datetime -from os import environ -from typing import Dict, List, Set, Optional, Any, Union - -import pandas as pd -import v3io -from mlrun.config import config -from mlrun.run import MLClientCtx -from mlrun.utils import logger -from mlrun.utils.model_monitoring import ( - parse_model_endpoint_store_prefix, - create_model_endpoint_id, -) -from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client -from nuclio import Event -from storey import ( - FieldAggregator, - NoopDriver, - Table, - Map, - MapClass, - AggregateByKey, - build_flow, - Filter, - FlatMap, - TSDBTarget, - ParquetTarget, - SyncEmitSource, -) -from storey.dtypes import SlidingWindows -from storey.steps import SampleWindow -# Constants -from v3io.dataplane import RaiseForStatus - -ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z" -FUNCTION_URI = "function_uri" -MODEL = "model" -VERSION = "version" -VERSIONED_MODEL = "versioned_model" -MODEL_CLASS = "model_class" -TIMESTAMP = "timestamp" -ENDPOINT_ID = "endpoint_id" -REQUEST_ID = "request_id" -LABELS = "labels" -UNPACKED_LABELS = "unpacked_labels" -LATENCY_AVG_5M = "latency_avg_5m" -LATENCY_AVG_1H = "latency_avg_1h" -PREDICTIONS_PER_SECOND = "predictions_per_second" -PREDICTIONS_COUNT_5M = "predictions_count_5m" -PREDICTIONS_COUNT_1H = "predictions_count_1h" -FIRST_REQUEST = "first_request" -LAST_REQUEST = "last_request" -ERROR_COUNT = "error_count" -ENTITIES = "entities" -FEATURE_NAMES = "feature_names" -LABEL_COLUMNS = "label_columns" -LATENCY = "latency" -RECORD_TYPE = "record_type" -FEATURES = "features" -PREDICTION = "prediction" -PREDICTIONS = "predictions" -NAMED_FEATURES = "named_features" -NAMED_PREDICTIONS = "named_predictions" -BASE_METRICS = "base_metrics" -CUSTOM_METRICS = "custom_metrics" -ENDPOINT_FEATURES = 
"endpoint_features" -METRICS = "metrics" -BATCH_TIMESTAMP = "batch_timestamp" -TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f" # ISO 8061 - - -# Stream processing code -class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - 
tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - 
Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - 
max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - "v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - - def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass - - @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event - - def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e - - @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = 
{ - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed - - @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event - - -class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - - def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - 
self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - return events - - def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if 
first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id) - - def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False - - def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False - - -def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event - - -def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs) - - -class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - - def do(self, event): - new_event = {} - for key in self.keys: - if 
key in event: - new_event[key] = event[key] - - return new_event if new_event else None - - -class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - - def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked - - -class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - - def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - 
table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event - - -class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - - def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event - - -class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - - def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event - - -def 
get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None - - -def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor) - - -def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body) diff --git a/functions/development/model_monitoring_stream/0.9.1/src/requirements.txt b/functions/development/model_monitoring_stream/0.9.1/src/requirements.txt deleted file mode 100644 index ef238930..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -storey -nuclio -v3io \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/static/documentation.html b/functions/development/model_monitoring_stream/0.9.1/static/documentation.html deleted file mode 100644 index 8a9a22d7..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/static/documentation.html +++ /dev/null @@ -1,248 +0,0 @@ - - - - - - - -model_monitoring_stream package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

model_monitoring_stream package

-
-

Submodules

-
-
-

model_monitoring_stream.model_monitoring_stream module

-
-
-class model_monitoring_stream.model_monitoring_stream.EventStreamProcessor(project: str, sample_window: int = 10, tsdb_batching_max_events: int = 10, tsdb_batching_timeout_secs: int = 300, parquet_batching_max_events: int = 10000, parquet_batching_timeout_secs: int = 3600, aggregate_count_windows: Optional[List[str]] = None, aggregate_count_period: str = '30s', aggregate_avg_windows: Optional[List[str]] = None, aggregate_avg_period: str = '30s', v3io_access_key: Optional[str] = None, v3io_framesd: Optional[str] = None, v3io_api: Optional[str] = None)[source]
-

Bases: object

-
-
-static compute_predictions_per_second(event: dict)[source]
-
-
-
-consume(event: Dict)[source]
-
-
-
-static process_before_events_tsdb(event: Dict)[source]
-
-
-
-process_before_kv(event: dict)[source]
-
-
-
-static process_before_parquet(event: dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterKeys(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterNotNone(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-
-class model_monitoring_stream.model_monitoring_stream.InferSchema(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.MapFeatureNames(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.ProcessEndpointEvent(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: dict)[source]
-
-
-
-handle_errors(endpoint_id, event)bool[source]
-
-
-
-is_valid(endpoint_id: str, validation_function, field: Any, dict_path: List[str])[source]
-
-
-
-resume_state(endpoint_id)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.UnpackValues(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event)[source]
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.WriteToKV(*args: Any, **kwargs: Any)[source]
-

Bases: storey.

-
-
-do(event: Dict)[source]
-
-
-
-
-model_monitoring_stream.model_monitoring_stream.enrich_even_details(event)Optional[dict][source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.get_endpoint_record(kv_container: str, kv_path: str, endpoint_id: str, access_key: str)Optional[dict][source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.handler(context: mlrun.execution.MLClientCtx, event: nuclio.request.Event)[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.init_context(context: mlrun.execution.MLClientCtx)[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_list_of_numerics(field: List[Union[int, float, dict, list]], dict_path: List[str])[source]
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_not_none(field: Any, dict_path: List[str])[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/static/example.html b/functions/development/model_monitoring_stream/0.9.1/static/example.html deleted file mode 100644 index 5b21dbfe..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/static/example.html +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - -Model Monitoring - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Model Monitoring

-
-

Initial set up (and pre-requisites)

-
    -
  1. Make sure you have the mlrun-api datasource available in your Grafana instance, otherwise add it by:

    -
      -
    1. Open your grafana instance

    2. -
    3. Navigate to Configuration -> Data Sources

    4. -
    5. Press Add data source and configure the following parameters

    6. -
    -
    Name: mlrun-api
    -URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints
    -Access: Server (default)
    -
    -## Add a custom header of:
    -X-V3io-Session-Key: <YOUR ACCESS KEY>
    -
    -
    -
      -
    1. Press Save & Test to make sure it works, a confirmation message should appear when this button is pressed

    2. -
    -
  2. -
  3. Import the available dashboards (./dashboards/*) to you Grafana instance

  4. -
  5. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the -training step

    -
    # Log model
    -context.log_model(
    -    "model",
    -    body=dumps(model),
    -    artifact_path=context.artifact_subpath("models"),
    -    extra_data=eval_metrics,
    -    model_file="model.pkl",
    -    metrics=context.results,
    -    training_set=X_test,  # <- make sure this is passed into log_model
    -    labels={"class": "sklearn.linear_model.LogisticRegression"}
    -)
    -
    -
    -
  6. -
  7. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying -fn.set_tracking()

  8. -
-
-
-

Configuration

-

The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The -available configurations can be found under stream.Config. Once configured it should be supplied as environment -parameters to the Nuclio function by setting fn.set_envs

-
project: str                        # project name
-sample_window: int                  # The sampling window for the data that flows into the TSDB and the KV
-kv_path_template: str               # Path template for the kv table
-tsdb_path_template: str             # Path template for the tsdb table
-parquet_path_template: str          # v3io parquets path template, assumes v3io is mounted
-tsdb_batching_max_events: int       # The max amount of event to batch before writing the batch to tsdb
-tsdb_batching_timeout_secs: int     # The max amount of seconds a given batch can be gathered before being emitted
-parquet_batching_max_events: int    # The max amount of event to batch before writing the batch to parquet
-parquet_batching_timeout_secs: int  # The max amount of seconds, a given batch can be gathered before being written to parquet
-container: str                      # container name
-v3io_access_key: str                # V3IO Access key
-v3io_framesd: str                   # V3IO framesd URL
-time_format: str                    # The time format into which time related fields will be converted
-aggregate_count_windows: List[str]  # List of window sizes for predictions count
-aggregate_count_period: str         # Period of predictions count windows
-aggregate_avg_windows: List[str]    # List of window sizes for average latency
-aggregate_avg_period: str           # Period of average latency windows
-
-
-
-
-

Export function yaml

-
-
-
from mlrun import code_to_function
-from mlrun.runtimes import RemoteRuntime
-
-
-fn: RemoteRuntime = code_to_function(
-    name="model-monitoring-stream",
-    kind="nuclio",
-    image="mlrun/mlrun",
-    filename="model_monitoring_stream.py",
-    handler="handler",
-)
-fn.export("model_monitoring_stream.yaml")
-
-
-
-
-
-
-

Deploy Stream Processing

-
-
-
import os
-
-from mlrun import import_function
-from mlrun.platforms import mount_v3io
-from mlrun.runtimes import RemoteRuntime
-import json
-
-# Set project name
-project = ""
-
-fn: RemoteRuntime = import_function("hub://model_monitoring_stream")
-
-fn.add_v3io_stream_trigger(
-    stream_path=f"projects/{project}/model-endpoints/stream",
-    name="monitoring_stream_trigger",
-)
-
-fn.set_env("MODEL_MONITORING_PARAMETERS", json.dumps({"project": project, "v3io_framesd": os.environ.get("V3IO_FRAMESD")}))
-
-fn.metadata.project = project
-fn.apply(mount_v3io())
-fn.deploy()
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/static/function.html b/functions/development/model_monitoring_stream/0.9.1/static/function.html deleted file mode 100644 index 81290204..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/static/function.html +++ /dev/null @@ -1,289 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: model-monitoring-stream
-  tag: ''
-  hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c
-  project: ''
-  categories:
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: livsmichael/mlrun-api:automation
-  entry_points:
-    consume:
-      name: consume
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 293
-    compute_predictions_per_second:
-      name: compute_predictions_per_second
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 311
-    process_before_kv:
-      name: process_before_kv
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 316
-    process_before_events_tsdb:
-      name: process_before_events_tsdb
-      doc: ''
-      parameters:
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 325
-    process_before_parquet:
-      name: process_before_parquet
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 362
-    set_none_if_empty:
-      name: set_none_if_empty
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 364
-    drop_if_exists:
-      name: drop_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 369
-    unpack_if_exists:
-      name: unpack_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 373
-    do:
-      name: do
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 702
-    resume_state:
-      name: resume_state
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 475
-    is_valid:
-      name: is_valid
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: validation_function
-        default: ''
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 495
-    handle_errors:
-      name: handle_errors
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-        type: bool
-      lineno: 503
-    enrich_even_details:
-      name: enrich_even_details
-      doc: ''
-      parameters:
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 511
-    is_not_none:
-      name: is_not_none
-      doc: ''
-      parameters:
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 536
-    is_list_of_numerics:
-      name: is_list_of_numerics
-      doc: ''
-      parameters:
-      - name: field
-        type: List[Union[int, float, dict, list]]
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 545
-    get_endpoint_record:
-      name: get_endpoint_record
-      doc: ''
-      parameters:
-      - name: kv_container
-        type: str
-        default: ''
-      - name: kv_path
-        type: str
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: access_key
-        type: str
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 717
-    init_context:
-      name: init_context
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 743
-    handler:
-      name: handler
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: event
-        type: Event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 751
-  description: ''
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: model-monitoring-stream
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-    spec:
-      runtime: python:3.6
-      handler: model_monitoring_stream:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: 
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-  default_handler: handler
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/static/item.html b/functions/development/model_monitoring_stream/0.9.1/static/item.html deleted file mode 100644 index 879aebde..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/static/item.html +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- monitoring
-description: ''
-doc: ''
-example: model_monitoring_stream.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels: {}
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: model-monitoring-stream
-platformVersion: 3.2.0
-spec:
-  filename: model_monitoring_stream.py
-  handler: handler
-  image: livsmichael/mlrun-api:automation
-  kind: nuclio
-  requirements: []
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/0.9.1/static/source.html b/functions/development/model_monitoring_stream/0.9.1/static/source.html deleted file mode 100644 index e76e5230..00000000 --- a/functions/development/model_monitoring_stream/0.9.1/static/source.html +++ /dev/null @@ -1,776 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-class EventStreamProcessor:
-    def __init__(
-        self,
-        project: str,
-        sample_window: int = 10,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        parquet_batching_max_events: int = 10_000,
-        parquet_batching_timeout_secs: int = 60 * 60,  # Default 1 hour
-        aggregate_count_windows: Optional[List[str]] = None,
-        aggregate_count_period: str = "30s",
-        aggregate_avg_windows: Optional[List[str]] = None,
-        aggregate_avg_period: str = "30s",
-        v3io_access_key: Optional[str] = None,
-        v3io_framesd: Optional[str] = None,
-        v3io_api: Optional[str] = None,
-    ):
-        self.project = project
-        self.sample_window = sample_window
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-        self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
-        self.aggregate_count_period = aggregate_count_period
-        self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
-        self.aggregate_avg_period = aggregate_avg_period
-
-        self.v3io_framesd = v3io_framesd or config.v3io_framesd
-        self.v3io_api = v3io_api or config.v3io_api
-
-        self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
-        )
-
-        template = config.model_endpoint_monitoring.store_prefixes.default
-
-        kv_path = template.format(project=project, kind="endpoints")
-        _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)
-
-        tsdb_path = template.format(project=project, kind="events")
-        _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-
-        self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format(
-            project=project, kind="parquet"
-        )
-
-        logger.info(
-            "V3IO Configuration",
-            v3io_access_key=self.v3io_access_key,
-            model_monitoring_access_key=self.model_monitoring_access_key,
-            default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default,
-            user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space,
-            v3io_api=self.v3io_api,
-            v3io_framesd=self.v3io_framesd,
-            kv_container=self.kv_container,
-            kv_path=self.kv_path,
-            tsdb_container=self.tsdb_container,
-            tsdb_path=self.tsdb_path,
-            parquet_path=self.parquet_path,
-        )
-
-        self._kv_keys = [
-            FUNCTION_URI,
-            MODEL,
-            MODEL_CLASS,
-            TIMESTAMP,
-            ENDPOINT_ID,
-            LABELS,
-            UNPACKED_LABELS,
-            LATENCY_AVG_5M,
-            LATENCY_AVG_1H,
-            PREDICTIONS_PER_SECOND,
-            PREDICTIONS_COUNT_5M,
-            PREDICTIONS_COUNT_1H,
-            FIRST_REQUEST,
-            LAST_REQUEST,
-            ERROR_COUNT,
-        ]
-
-        self._flow = build_flow(
-            [
-                SyncEmitSource(),
-                ProcessEndpointEvent(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    v3io_access_key=self.v3io_access_key,
-                ),
-                FilterNotNone(),
-                FlatMap(lambda x: x),
-                MapFeatureNames(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    access_key=self.v3io_access_key,
-                ),
-                # Branch 1: Aggregate events, count averages and update TSDB and KV
-                [
-                    AggregateByKey(
-                        aggregates=[
-                            FieldAggregator(
-                                PREDICTIONS,
-                                ENDPOINT_ID,
-                                ["count"],
-                                SlidingWindows(
-                                    self.aggregate_count_windows,
-                                    self.aggregate_count_period,
-                                ),
-                            ),
-                            FieldAggregator(
-                                LATENCY,
-                                LATENCY,
-                                ["avg"],
-                                SlidingWindows(
-                                    self.aggregate_avg_windows,
-                                    self.aggregate_avg_period,
-                                ),
-                            ),
-                        ],
-                        table=Table("notable", NoopDriver()),
-                    ),
-                    SampleWindow(
-                        self.sample_window
-                    ),  # Add required gap between event to apply sampling
-                    Map(self.compute_predictions_per_second),
-                    # Branch 1.1: Updated KV
-                    [
-                        Map(self.process_before_kv),
-                        WriteToKV(container=self.kv_container, table=self.kv_path),
-                        InferSchema(
-                            v3io_access_key=self.v3io_access_key,
-                            v3io_framesd=self.v3io_framesd,
-                            container=self.kv_container,
-                            table=self.kv_path,
-                        ),
-                    ],
-                    # Branch 1.2: Update TSDB
-                    [
-                        # Map the event into taggable fields, add record type to each field
-                        Map(self.process_before_events_tsdb),
-                        [
-                            FilterKeys(BASE_METRICS),
-                            UnpackValues(BASE_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(ENDPOINT_FEATURES),
-                            UnpackValues(ENDPOINT_FEATURES),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(CUSTOM_METRICS),
-                            FilterNotNone(),
-                            UnpackValues(CUSTOM_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                    ],
-                ],
-                # Branch 2: Batch events, write to parquet
-                [
-                    Map(self.process_before_parquet),
-                    ParquetTarget(
-                        path=self.parquet_path,
-                        partition_cols=["$key", "$year", "$month", "$day", "$hour"],
-                        infer_columns_from_data=True,
-                        # Settings for _Batching
-                        max_events=self.parquet_batching_max_events,
-                        timeout_secs=self.parquet_batching_timeout_secs,
-                        # Settings for v3io storage
-                        storage_options={
-                            "v3io_api": self.v3io_api,
-                            "v3io_access_key": self.model_monitoring_access_key,
-                        },
-                    ),
-                ],
-            ]
-        ).run()
-
-    def consume(self, event: Dict):
-        events = []
-        if "headers" in event and "values" in event:
-            for values in event["values"]:
-                events.append({k: v for k, v in zip(event["headers"], values)})
-        else:
-            events.append(event)
-
-        for enriched in map(enrich_even_details, events):
-            if enriched is not None:
-                self._flow.emit(
-                    enriched,
-                    key=enriched[ENDPOINT_ID],
-                    event_time=datetime.strptime(enriched["when"], ISO_8061_UTC),
-                )
-            else:
-                pass
-
-    @staticmethod
-    def compute_predictions_per_second(event: dict):
-        event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600
-        return event
-
-    def process_before_kv(self, event: dict):
-        # Filter relevant keys
-        e = {k: event[k] for k in self._kv_keys}
-        # Unpack labels dictionary
-        e = {**e, **e.pop(UNPACKED_LABELS, {})}
-        # Write labels to kv as json string to be presentable later
-        e[LABELS] = json.dumps(e[LABELS])
-        return e
-
-    @staticmethod
-    def process_before_events_tsdb(event: Dict):
-        base_fields = [TIMESTAMP, ENDPOINT_ID]
-
-        base_event = {k: event[k] for k in base_fields}
-        base_event[TIMESTAMP] = pd.to_datetime(
-            base_event[TIMESTAMP], format=TIME_FORMAT
-        )
-
-        base_metrics = {
-            RECORD_TYPE: BASE_METRICS,
-            PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND],
-            PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M],
-            PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H],
-            LATENCY_AVG_5M: event[LATENCY_AVG_5M],
-            LATENCY_AVG_1H: event[LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        endpoint_features = {
-            RECORD_TYPE: ENDPOINT_FEATURES,
-            **event[NAMED_PREDICTIONS],
-            **event[NAMED_FEATURES],
-            **base_event,
-        }
-
-        processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features}
-
-        if event[METRICS]:
-            processed[CUSTOM_METRICS] = {
-                RECORD_TYPE: CUSTOM_METRICS,
-                **event[METRICS],
-                **base_event,
-            }
-
-        return processed
-
-    @staticmethod
-    def process_before_parquet(event: dict):
-        def set_none_if_empty(_event: dict, keys: List[str]):
-            for key in keys:
-                if not _event.get(key):
-                    _event[key] = None
-
-        def drop_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                _event.pop(key, None)
-
-        def unpack_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                value = _event.get(key)
-                if value is not None:
-                    _event = {**value, **event}
-
-        drop_if_exists(event, [UNPACKED_LABELS, FEATURES])
-        unpack_if_exists(event, [ENTITIES])
-        set_none_if_empty(event, [LABELS, METRICS, ENTITIES])
-        return event
-
-
-class ProcessEndpointEvent(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container: str = kv_container
-        self.kv_path: str = kv_path
-        self.v3io_access_key: str = v3io_access_key
-        self.first_request: Dict[str, str] = dict()
-        self.last_request: Dict[str, str] = dict()
-        self.error_count: Dict[str, int] = defaultdict(int)
-        self.endpoints: Set[str] = set()
-
-    def do(self, event: dict):
-        function_uri = event[FUNCTION_URI]
-        versioned_model = event[VERSIONED_MODEL]
-        endpoint_id = event[ENDPOINT_ID]
-
-        # In case this process fails, resume state from existing record
-        self.resume_state(endpoint_id)
-
-        # Handle errors coming from stream
-        found_errors = self.handle_errors(endpoint_id, event)
-        if found_errors:
-            return None
-
-        # Validate event fields
-        model_class = event.get("model_class") or event.get("class")
-        timestamp = event.get("when")
-        request_id = event.get("request", {}).get("id")
-        latency = event.get("microsec")
-        features = event.get("request", {}).get("inputs")
-        predictions = event.get("resp", {}).get("outputs")
-
-        if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],):
-            return None
-
-        if endpoint_id not in self.first_request:
-            self.first_request[endpoint_id] = timestamp
-        self.last_request[endpoint_id] = timestamp
-
-        if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],):
-            return None
-        if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, features, ["request", "inputs"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, predictions, ["resp", "outputs"],
-        ):
-            return None
-
-        unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()}
-
-        # Separate each model invocation into sub events
-        events = []
-        for i, (feature, prediction) in enumerate(zip(features, predictions)):
-            if not self.is_valid(
-                endpoint_id,
-                is_list_of_numerics,
-                feature,
-                ["request", "inputs", f"[{i}]"],
-            ):
-                return None
-
-            if not isinstance(prediction, list):
-                prediction = [prediction]
-
-            events.append(
-                {
-                    FUNCTION_URI: function_uri,
-                    MODEL: versioned_model,
-                    MODEL_CLASS: model_class,
-                    TIMESTAMP: timestamp,
-                    ENDPOINT_ID: endpoint_id,
-                    REQUEST_ID: request_id,
-                    LATENCY: latency,
-                    FEATURES: feature,
-                    PREDICTION: prediction,
-                    FIRST_REQUEST: self.first_request[endpoint_id],
-                    LAST_REQUEST: self.last_request[endpoint_id],
-                    ERROR_COUNT: self.error_count[endpoint_id],
-                    LABELS: event.get(LABELS, {}),
-                    METRICS: event.get(METRICS, {}),
-                    ENTITIES: event.get("request", {}).get(ENTITIES, {}),
-                    UNPACKED_LABELS: unpacked_labels,
-                }
-            )
-        return events
-
-    def resume_state(self, endpoint_id):
-        # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
-        # left them
-        if endpoint_id not in self.endpoints:
-            logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.v3io_access_key,
-            )
-            if endpoint_record:
-                first_request = endpoint_record.get(FIRST_REQUEST)
-                if first_request:
-                    self.first_request[endpoint_id] = first_request
-                error_count = endpoint_record.get(ERROR_COUNT)
-                if error_count:
-                    self.error_count[endpoint_id] = error_count
-            self.endpoints.add(endpoint_id)
-
-    def is_valid(
-        self, endpoint_id: str, validation_function, field: Any, dict_path: List[str]
-    ):
-        if validation_function(field, dict_path):
-            return True
-        self.error_count[endpoint_id] += 1
-        return False
-
-    def handle_errors(self, endpoint_id, event) -> bool:
-        if "error" in event:
-            self.error_count[endpoint_id] += 1
-            return True
-
-        return False
-
-
-def enrich_even_details(event) -> Optional[dict]:
-    function_uri = event.get(FUNCTION_URI)
-
-    if not is_not_none(function_uri, [FUNCTION_URI]):
-        return None
-
-    model = event.get(MODEL)
-    if not is_not_none(model, [MODEL]):
-        return None
-
-    version = event.get(VERSION)
-    versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-    endpoint_id = create_model_endpoint_id(
-        function_uri=function_uri, versioned_model=versioned_model,
-    )
-
-    endpoint_id = str(endpoint_id)
-
-    event[VERSIONED_MODEL] = versioned_model
-    event[ENDPOINT_ID] = endpoint_id
-
-    return event
-
-
-def is_not_none(field: Any, dict_path: List[str]):
-    if field is not None:
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-def is_list_of_numerics(
-    field: List[Union[int, float, dict, list]], dict_path: List[str]
-):
-    if all(isinstance(x, int) or isinstance(x, float) for x in field):
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-class FilterNotNone(Filter):
-    def __init__(self, **kwargs):
-        super().__init__(fn=lambda event: event is not None, **kwargs)
-
-
-class FilterKeys(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys = list(args)
-
-    def do(self, event):
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        return new_event if new_event else None
-
-
-class UnpackValues(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys_to_unpack = set(args)
-
-    def do(self, event):
-        unpacked = {}
-        for key in event.keys():
-            if key in self.keys_to_unpack:
-                unpacked = {**unpacked, **event[key]}
-            else:
-                unpacked[key] = event[key]
-        return unpacked
-
-
-class MapFeatureNames(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container = kv_container
-        self.kv_path = kv_path
-        self.access_key = access_key
-        self.feature_names = {}
-        self.label_columns = {}
-
-    def do(self, event: Dict):
-        endpoint_id = event[ENDPOINT_ID]
-
-        if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.access_key,
-            )
-            feature_names = endpoint_record.get(FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
-
-            label_columns = endpoint_record.get(LABEL_COLUMNS)
-            label_columns = json.loads(label_columns) if label_columns else None
-
-            if not feature_names:
-                logger.warn(
-                    f"Feature names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            if not label_columns:
-                logger.warn(
-                    f"label column names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            self.label_columns[endpoint_id] = label_columns
-            self.feature_names[endpoint_id] = feature_names
-
-            logger.info(
-                "Label columns", endpoint_id=endpoint_id, label_columns=label_columns
-            )
-            logger.info(
-                "Feature names", endpoint_id=endpoint_id, feature_names=feature_names
-            )
-
-        feature_names = self.feature_names[endpoint_id]
-        features = event[FEATURES]
-        event[NAMED_FEATURES] = {
-            name: feature for name, feature in zip(feature_names, features)
-        }
-
-        label_columns = self.label_columns[endpoint_id]
-        prediction = event[PREDICTION]
-        event[NAMED_PREDICTIONS] = {
-            name: prediction for name, prediction in zip(label_columns, prediction)
-        }
-        logger.info("Mapped event", event=event)
-        return event
-
-
-class WriteToKV(MapClass):
-    def __init__(self, container: str, table: str, **kwargs):
-        super().__init__(**kwargs)
-        self.container = container
-        self.table = table
-
-    def do(self, event: Dict):
-        get_v3io_client().kv.update(
-            container=self.container,
-            table_path=self.table,
-            key=event[ENDPOINT_ID],
-            attributes=event,
-        )
-        return event
-
-
-class InferSchema(MapClass):
-    def __init__(
-        self,
-        v3io_access_key: str,
-        v3io_framesd: str,
-        container: str,
-        table: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.container = container
-        self.v3io_access_key = v3io_access_key
-        self.v3io_framesd = v3io_framesd
-        self.table = table
-        self.keys = set()
-
-    def do(self, event: Dict):
-        key_set = set(event.keys())
-        if not key_set.issubset(self.keys):
-            self.keys.update(key_set)
-            get_frames_client(
-                token=self.v3io_access_key,
-                container=self.container,
-                address=self.v3io_framesd,
-            ).execute(backend="kv", table=self.table, command="infer_schema")
-            logger.info(
-                "Found new keys, inferred schema", table=self.table, event=event
-            )
-        return event
-
-
-def get_endpoint_record(
-    kv_container: str, kv_path: str, endpoint_id: str, access_key: str
-) -> Optional[dict]:
-    logger.info(
-        f"Grabbing endpoint data",
-        container=kv_container,
-        table_path=kv_path,
-        key=endpoint_id,
-    )
-    try:
-        endpoint_record = (
-            get_v3io_client()
-            .kv.get(
-                container=kv_container,
-                table_path=kv_path,
-                key=endpoint_id,
-                access_key=access_key,
-                raise_for_status=v3io.dataplane.RaiseForStatus.always,
-            )
-            .output.item
-        )
-        return endpoint_record
-    except Exception:
-        return None
-
-
-def init_context(context: MLClientCtx):
-    context.logger.info("Initializing EventStreamProcessor")
-    parameters = environ.get("MODEL_MONITORING_PARAMETERS")
-    parameters = json.loads(parameters) if parameters else {}
-    stream_processor = EventStreamProcessor(**parameters)
-    setattr(context, "stream_processor", stream_processor)
-
-
-def handler(context: MLClientCtx, event: Event):
-    event_body = json.loads(event.body)
-    context.logger.debug(event_body)
-    context.stream_processor.consume(event_body)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/src/function.yaml b/functions/development/model_monitoring_stream/1.1.0/src/function.yaml deleted file mode 100644 index 07a21c40..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/src/function.yaml +++ /dev/null @@ -1,267 +0,0 @@ -kind: remote -metadata: - name: model-monitoring-stream - tag: '' - hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c - project: '' - categories: - - monitoring -spec: - command: '' - args: [] - image: livsmichael/mlrun-api:automation - entry_points: - consume: - name: consume - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 293 - compute_predictions_per_second: - name: compute_predictions_per_second - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 311 - process_before_kv: - name: process_before_kv - doc: '' - parameters: - - name: self - default: '' - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 316 - process_before_events_tsdb: - name: process_before_events_tsdb - doc: '' - parameters: - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 325 - process_before_parquet: - name: process_before_parquet - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 362 - set_none_if_empty: - name: set_none_if_empty - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 364 - drop_if_exists: - name: drop_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 369 - unpack_if_exists: - name: unpack_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: 
List[str] - default: '' - outputs: - - default: '' - lineno: 373 - do: - name: do - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 702 - resume_state: - name: resume_state - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - outputs: - - default: '' - lineno: 475 - is_valid: - name: is_valid - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - type: str - default: '' - - name: validation_function - default: '' - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 495 - handle_errors: - name: handle_errors - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - - name: event - default: '' - outputs: - - default: '' - type: bool - lineno: 503 - enrich_even_details: - name: enrich_even_details - doc: '' - parameters: - - name: event - default: '' - outputs: - - default: '' - lineno: 511 - is_not_none: - name: is_not_none - doc: '' - parameters: - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 536 - is_list_of_numerics: - name: is_list_of_numerics - doc: '' - parameters: - - name: field - type: List[Union[int, float, dict, list]] - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 545 - get_endpoint_record: - name: get_endpoint_record - doc: '' - parameters: - - name: kv_container - type: str - default: '' - - name: kv_path - type: str - default: '' - - name: endpoint_id - type: str - default: '' - - name: access_key - type: str - default: '' - outputs: - - default: '' - lineno: 717 - init_context: - name: init_context - doc: '' - parameters: - - name: context - type: MLClientCtx - default: '' - outputs: - - default: '' - lineno: 743 - handler: - name: handler - doc: '' - parameters: - 
- name: context - type: MLClientCtx - default: '' - - name: event - type: Event - default: '' - outputs: - - default: '' - lineno: 751 - description: '' - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: model-monitoring-stream - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - spec: - runtime: python:3.6 - handler: model_monitoring_stream:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode:  - source: '' - build: - commands: [] - code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - default_handler: handler -verbose: false diff --git a/functions/development/model_monitoring_stream/1.1.0/src/item.yaml b/functions/development/model_monitoring_stream/1.1.0/src/item.yaml deleted file mode 100644 index 219fa528..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: -- monitoring -description: '' -doc: '' -example: model_monitoring_stream.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: {} -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: model-monitoring-stream -platformVersion: 3.5.0 -spec: - filename: model_monitoring_stream.py - handler: handler - image: livsmichael/mlrun-api:automation - kind: nuclio - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/model_monitoring_stream/1.1.0/src/model_monitoring_stream.ipynb b/functions/development/model_monitoring_stream/1.1.0/src/model_monitoring_stream.ipynb deleted file mode 100644 index 93d8c92e..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/src/model_monitoring_stream.ipynb +++ /dev/null 
@@ -1,178 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Model Monitoring\n", - "\n", - "## Initial set up (and pre-requisites)\n", - "1. Make sure you have the `mlrun-api` datasource available in your Grafana instance, otherwise add it by:\n", - " 1. Open your grafana instance\n", - " 2. Navigate to `Configuration -> Data Sources`\n", - " 3. Press `Add data source` and configure the following parameters\n", - " ```\n", - " Name: mlrun-api\n", - " URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints\n", - " Access: Server (default)\n", - "\n", - " ## Add a custom header of:\n", - " X-V3io-Session-Key: \n", - " ```\n", - " 4. Press `Save & Test` to make sure it works, a confirmation message should appear when this button is pressed\n", - "\n", - "2. Import the available dashboards `(./dashboards/*)` to you Grafana instance\n", - "3. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the\n", - " training step\n", - "\n", - " ```python\n", - " # Log model\n", - " context.log_model(\n", - " \"model\",\n", - " body=dumps(model),\n", - " artifact_path=context.artifact_subpath(\"models\"),\n", - " extra_data=eval_metrics,\n", - " model_file=\"model.pkl\",\n", - " metrics=context.results,\n", - " training_set=X_test, # <- make sure this is passed into log_model\n", - " labels={\"class\": \"sklearn.linear_model.LogisticRegression\"}\n", - " )\n", - " ```\n", - "4. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying\n", - " `fn.set_tracking()`\n", - "\n", - "## Configuration\n", - "The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The\n", - "available configurations can be found under `stream.Config`. 
Once configured it should be supplied as environment\n", - "parameters to the Nuclio function by setting `fn.set_envs`\n", - "\n", - "```python\n", - "project: str # project name\n", - "sample_window: int # The sampling window for the data that flows into the TSDB and the KV\n", - "kv_path_template: str # Path template for the kv table\n", - "tsdb_path_template: str # Path template for the tsdb table\n", - "parquet_path_template: str # v3io parquets path template, assumes v3io is mounted\n", - "tsdb_batching_max_events: int # The max amount of event to batch before writing the batch to tsdb\n", - "tsdb_batching_timeout_secs: int # The max amount of seconds a given batch can be gathered before being emitted\n", - "parquet_batching_max_events: int # The max amount of event to batch before writing the batch to parquet\n", - "parquet_batching_timeout_secs: int # The max amount of seconds, a given batch can be gathered before being written to parquet\n", - "container: str # container name\n", - "v3io_access_key: str # V3IO Access key\n", - "v3io_framesd: str # V3IO framesd URL\n", - "time_format: str # The time format into which time related fields will be converted\n", - "aggregate_count_windows: List[str] # List of window sizes for predictions count\n", - "aggregate_count_period: str # Period of predictions count windows\n", - "aggregate_avg_windows: List[str] # List of window sizes for average latency\n", - "aggregate_avg_period: str # Period of average latency windows\n", - "```" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Export function yaml" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.runtimes import RemoteRuntime\n", - "\n", - "\n", - "fn: RemoteRuntime = code_to_function(\n", - 
" name=\"model-monitoring-stream\",\n", - " kind=\"nuclio\",\n", - " image=\"mlrun/mlrun\",\n", - " filename=\"model_monitoring_stream.py\",\n", - " handler=\"handler\",\n", - ")\n", - "fn.export(\"model_monitoring_stream.yaml\")\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy Stream Processing" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from mlrun import import_function\n", - "from mlrun.platforms import mount_v3io\n", - "from mlrun.runtimes import RemoteRuntime\n", - "import json\n", - "\n", - "# Set project name\n", - "project = \"\"\n", - "\n", - "fn: RemoteRuntime = import_function(\"hub://model_monitoring_stream\")\n", - "\n", - "fn.add_v3io_stream_trigger(\n", - " stream_path=f\"projects/{project}/model-endpoints/stream\",\n", - " name=\"monitoring_stream_trigger\",\n", - ")\n", - "\n", - "fn.set_env(\"MODEL_MONITORING_PARAMETERS\", json.dumps({\"project\": project, \"v3io_framesd\": os.environ.get(\"V3IO_FRAMESD\")}))\n", - "\n", - "fn.metadata.project = project\n", - "fn.apply(mount_v3io())\n", - "fn.deploy()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/src/model_monitoring_stream.py b/functions/development/model_monitoring_stream/1.1.0/src/model_monitoring_stream.py deleted file mode 100644 
index 90c8b92c..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/src/model_monitoring_stream.py +++ /dev/null @@ -1,768 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import json -import os -from collections import defaultdict -from datetime import datetime -from os import environ -from typing import Dict, List, Set, Optional, Any, Union - -import pandas as pd -import v3io -from mlrun.config import config -from mlrun.run import MLClientCtx -from mlrun.utils import logger -from mlrun.utils.model_monitoring import ( - parse_model_endpoint_store_prefix, - create_model_endpoint_id, -) -from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client -from nuclio import Event -from storey import ( - FieldAggregator, - NoopDriver, - Table, - Map, - MapClass, - AggregateByKey, - build_flow, - Filter, - FlatMap, - TSDBTarget, - ParquetTarget, - SyncEmitSource, -) -from storey.dtypes import SlidingWindows -from storey.steps import SampleWindow -# Constants -from v3io.dataplane import RaiseForStatus - -ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z" -FUNCTION_URI = "function_uri" -MODEL = "model" -VERSION = "version" -VERSIONED_MODEL = "versioned_model" -MODEL_CLASS = "model_class" -TIMESTAMP = "timestamp" -ENDPOINT_ID = "endpoint_id" -REQUEST_ID = "request_id" -LABELS = "labels" -UNPACKED_LABELS = "unpacked_labels" -LATENCY_AVG_5M = "latency_avg_5m" -LATENCY_AVG_1H = "latency_avg_1h" -PREDICTIONS_PER_SECOND = 
"predictions_per_second" -PREDICTIONS_COUNT_5M = "predictions_count_5m" -PREDICTIONS_COUNT_1H = "predictions_count_1h" -FIRST_REQUEST = "first_request" -LAST_REQUEST = "last_request" -ERROR_COUNT = "error_count" -ENTITIES = "entities" -FEATURE_NAMES = "feature_names" -LABEL_COLUMNS = "label_columns" -LATENCY = "latency" -RECORD_TYPE = "record_type" -FEATURES = "features" -PREDICTION = "prediction" -PREDICTIONS = "predictions" -NAMED_FEATURES = "named_features" -NAMED_PREDICTIONS = "named_predictions" -BASE_METRICS = "base_metrics" -CUSTOM_METRICS = "custom_metrics" -ENDPOINT_FEATURES = "endpoint_features" -METRICS = "metrics" -BATCH_TIMESTAMP = "batch_timestamp" -TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f" # ISO 8061 - - -# Stream processing code -class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - 
self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count 
averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - 
container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - "v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - - def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass - - @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event - - def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e - - @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: 
BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = { - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed - - @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event - - -class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - - def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - 
found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - 
return events - - def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id) - - def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False - - def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False - - -def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event - - -def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: 
List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs) - - -class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - - def do(self, event): - new_event = {} - for key in self.keys: - if key in event: - new_event[key] = event[key] - - return new_event if new_event else None - - -class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - - def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked - - -class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - - def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - 
get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event - - -class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - - def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event - - -class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - 
self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - - def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event - - -def get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None - - -def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor) - - -def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body) diff --git a/functions/development/model_monitoring_stream/1.1.0/src/requirements.txt b/functions/development/model_monitoring_stream/1.1.0/src/requirements.txt deleted file mode 100644 index ef238930..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -storey -nuclio -v3io \ No newline at end of file diff --git 
a/functions/development/model_monitoring_stream/1.1.0/static/documentation.html b/functions/development/model_monitoring_stream/1.1.0/static/documentation.html deleted file mode 100644 index 875606b7..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/static/documentation.html +++ /dev/null @@ -1,342 +0,0 @@ - - - - - - - -model_monitoring_stream package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

model_monitoring_stream package

- -
- -
-
-
-
-
-

model_monitoring_stream package#

-
-

Submodules#

-
-
-

model_monitoring_stream.model_monitoring_stream module#

-
-
-class model_monitoring_stream.model_monitoring_stream.EventStreamProcessor(project: str, sample_window: int = 10, tsdb_batching_max_events: int = 10, tsdb_batching_timeout_secs: int = 300, parquet_batching_max_events: int = 10000, parquet_batching_timeout_secs: int = 3600, aggregate_count_windows: Optional[List[str]] = None, aggregate_count_period: str = '30s', aggregate_avg_windows: Optional[List[str]] = None, aggregate_avg_period: str = '30s', v3io_access_key: Optional[str] = None, v3io_framesd: Optional[str] = None, v3io_api: Optional[str] = None)[source]#
-

Bases: object

-
-
-static compute_predictions_per_second(event: dict)[source]#
-
-
-
-consume(event: Dict)[source]#
-
-
-
-static process_before_events_tsdb(event: Dict)[source]#
-
-
-
-process_before_kv(event: dict)[source]#
-
-
-
-static process_before_parquet(event: dict)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterKeys(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterNotNone(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-
-class model_monitoring_stream.model_monitoring_stream.InferSchema(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: Dict)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.MapFeatureNames(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: Dict)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.ProcessEndpointEvent(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: dict)[source]#
-
-
-
-handle_errors(endpoint_id, event)bool[source]#
-
-
-
-is_valid(endpoint_id: str, validation_function, field: Any, dict_path: List[str])[source]#
-
-
-
-resume_state(endpoint_id)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.UnpackValues(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.WriteToKV(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: Dict)[source]#
-
-
-
-
-model_monitoring_stream.model_monitoring_stream.enrich_even_details(event)Optional[dict][source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.get_endpoint_record(kv_container: str, kv_path: str, endpoint_id: str, access_key: str)Optional[dict][source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.handler(context: mlrun.execution.MLClientCtx, event: nuclio.request.Event)[source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.init_context(context: mlrun.execution.MLClientCtx)[source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_list_of_numerics(field: List[Union[int, float, dict, list]], dict_path: List[str])[source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_not_none(field: Any, dict_path: List[str])[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/static/example.html b/functions/development/model_monitoring_stream/1.1.0/static/example.html deleted file mode 100644 index fe5c2685..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/static/example.html +++ /dev/null @@ -1,352 +0,0 @@ - - - - - - - -Model Monitoring - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Model Monitoring#

-
-

Initial set up (and pre-requisites)#

-
    -
  1. Make sure you have the mlrun-api datasource available in your Grafana instance, otherwise add it by:

    -
      -
    1. Open your grafana instance

    2. -
    3. Navigate to Configuration -> Data Sources

    4. -
    5. Press Add data source and configure the following parameters

    6. -
    -
    Name: mlrun-api
    -URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints
    -Access: Server (default)
    -
    -## Add a custom header of:
    -X-V3io-Session-Key: <YOUR ACCESS KEY>
    -
    -
    -
      -
    1. Press Save & Test to make sure it works, a confirmation message should appear when this button is pressed

    2. -
    -
  2. -
  3. Import the available dashboards (./dashboards/*) to you Grafana instance

  4. -
  5. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the -training step

    -
    # Log model
    -context.log_model(
    -    "model",
    -    body=dumps(model),
    -    artifact_path=context.artifact_subpath("models"),
    -    extra_data=eval_metrics,
    -    model_file="model.pkl",
    -    metrics=context.results,
    -    training_set=X_test,  # <- make sure this is passed into log_model
    -    labels={"class": "sklearn.linear_model.LogisticRegression"}
    -)
    -
    -
    -
  6. -
  7. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying -fn.set_tracking()

  8. -
-
-
-

Configuration#

-

The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The -available configurations can be found under stream.Config. Once configured it should be supplied as environment -parameters to the Nuclio function by setting fn.set_envs

-
project: str                        # project name
-sample_window: int                  # The sampling window for the data that flows into the TSDB and the KV
-kv_path_template: str               # Path template for the kv table
-tsdb_path_template: str             # Path template for the tsdb table
-parquet_path_template: str          # v3io parquets path template, assumes v3io is mounted
-tsdb_batching_max_events: int       # The max amount of event to batch before writing the batch to tsdb
-tsdb_batching_timeout_secs: int     # The max amount of seconds a given batch can be gathered before being emitted
-parquet_batching_max_events: int    # The max amount of event to batch before writing the batch to parquet
-parquet_batching_timeout_secs: int  # The max amount of seconds, a given batch can be gathered before being written to parquet
-container: str                      # container name
-v3io_access_key: str                # V3IO Access key
-v3io_framesd: str                   # V3IO framesd URL
-time_format: str                    # The time format into which time related fields will be converted
-aggregate_count_windows: List[str]  # List of window sizes for predictions count
-aggregate_count_period: str         # Period of predictions count windows
-aggregate_avg_windows: List[str]    # List of window sizes for average latency
-aggregate_avg_period: str           # Period of average latency windows
-
-
-
-
-

Export function yaml#

-
-
-
from mlrun import code_to_function
-from mlrun.runtimes import RemoteRuntime
-
-
-fn: RemoteRuntime = code_to_function(
-    name="model-monitoring-stream",
-    kind="nuclio",
-    image="mlrun/mlrun",
-    filename="model_monitoring_stream.py",
-    handler="handler",
-)
-fn.export("model_monitoring_stream.yaml")
-
-
-
-
-
-
-

Deploy Stream Processing#

-
-
-
import os
-
-from mlrun import import_function
-from mlrun.platforms import mount_v3io
-from mlrun.runtimes import RemoteRuntime
-import json
-
-# Set project name
-project = ""
-
-fn: RemoteRuntime = import_function("hub://model_monitoring_stream")
-
-fn.add_v3io_stream_trigger(
-    stream_path=f"projects/{project}/model-endpoints/stream",
-    name="monitoring_stream_trigger",
-)
-
-fn.set_env("MODEL_MONITORING_PARAMETERS", json.dumps({"project": project, "v3io_framesd": os.environ.get("V3IO_FRAMESD")}))
-
-fn.metadata.project = project
-fn.apply(mount_v3io())
-fn.deploy()
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/static/function.html b/functions/development/model_monitoring_stream/1.1.0/static/function.html deleted file mode 100644 index 81290204..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/static/function.html +++ /dev/null @@ -1,289 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: model-monitoring-stream
-  tag: ''
-  hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c
-  project: ''
-  categories:
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: livsmichael/mlrun-api:automation
-  entry_points:
-    consume:
-      name: consume
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 293
-    compute_predictions_per_second:
-      name: compute_predictions_per_second
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 311
-    process_before_kv:
-      name: process_before_kv
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 316
-    process_before_events_tsdb:
-      name: process_before_events_tsdb
-      doc: ''
-      parameters:
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 325
-    process_before_parquet:
-      name: process_before_parquet
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 362
-    set_none_if_empty:
-      name: set_none_if_empty
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 364
-    drop_if_exists:
-      name: drop_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 369
-    unpack_if_exists:
-      name: unpack_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 373
-    do:
-      name: do
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 702
-    resume_state:
-      name: resume_state
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 475
-    is_valid:
-      name: is_valid
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: validation_function
-        default: ''
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 495
-    handle_errors:
-      name: handle_errors
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-        type: bool
-      lineno: 503
-    enrich_even_details:
-      name: enrich_even_details
-      doc: ''
-      parameters:
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 511
-    is_not_none:
-      name: is_not_none
-      doc: ''
-      parameters:
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 536
-    is_list_of_numerics:
-      name: is_list_of_numerics
-      doc: ''
-      parameters:
-      - name: field
-        type: List[Union[int, float, dict, list]]
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 545
-    get_endpoint_record:
-      name: get_endpoint_record
-      doc: ''
-      parameters:
-      - name: kv_container
-        type: str
-        default: ''
-      - name: kv_path
-        type: str
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: access_key
-        type: str
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 717
-    init_context:
-      name: init_context
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 743
-    handler:
-      name: handler
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: event
-        type: Event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 751
-  description: ''
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: model-monitoring-stream
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-    spec:
-      runtime: python:3.6
-      handler: model_monitoring_stream:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: 
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-  default_handler: handler
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/static/item.html b/functions/development/model_monitoring_stream/1.1.0/static/item.html deleted file mode 100644 index 454c1df8..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- monitoring
-description: ''
-doc: ''
-example: model_monitoring_stream.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels: {}
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: model-monitoring-stream
-platformVersion: 3.5.0
-spec:
-  filename: model_monitoring_stream.py
-  handler: handler
-  image: livsmichael/mlrun-api:automation
-  kind: nuclio
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/static/model_monitoring_stream.html b/functions/development/model_monitoring_stream/1.1.0/static/model_monitoring_stream.html deleted file mode 100644 index 6f57cfc6..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/static/model_monitoring_stream.html +++ /dev/null @@ -1,908 +0,0 @@ - - - - - - - -model_monitoring_stream.model_monitoring_stream - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for model_monitoring_stream.model_monitoring_stream

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-
[docs]class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = 
config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - 
v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - 
"v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - -
[docs] def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass
- -
[docs] @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event
- -
[docs] def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e
- -
[docs] @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = { - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed
- -
[docs] @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event
- - -
[docs]class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - -
[docs] def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: 
self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - return events
- -
[docs] def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id)
- -
[docs] def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False
- -
[docs] def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False
- - -
[docs]def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event
- - -
[docs]def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False
- - -
[docs]def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False
- - -
[docs]class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs)
- - -
[docs]class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - -
[docs] def do(self, event): - new_event = {} - for key in self.keys: - if key in event: - new_event[key] = event[key] - - return new_event if new_event else None
- - -
[docs]class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - -
[docs] def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked
- - -
[docs]class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - -
[docs] def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = 
event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event
- - -
[docs]class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - -
[docs] def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event
- - -
[docs]class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - -
[docs] def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event
- - -
[docs]def get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None
- - -
[docs]def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor)
- - -
[docs]def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/1.1.0/static/source.html b/functions/development/model_monitoring_stream/1.1.0/static/source.html deleted file mode 100644 index 01b89503..00000000 --- a/functions/development/model_monitoring_stream/1.1.0/static/source.html +++ /dev/null @@ -1,790 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-class EventStreamProcessor:
-    def __init__(
-        self,
-        project: str,
-        sample_window: int = 10,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        parquet_batching_max_events: int = 10_000,
-        parquet_batching_timeout_secs: int = 60 * 60,  # Default 1 hour
-        aggregate_count_windows: Optional[List[str]] = None,
-        aggregate_count_period: str = "30s",
-        aggregate_avg_windows: Optional[List[str]] = None,
-        aggregate_avg_period: str = "30s",
-        v3io_access_key: Optional[str] = None,
-        v3io_framesd: Optional[str] = None,
-        v3io_api: Optional[str] = None,
-    ):
-        self.project = project
-        self.sample_window = sample_window
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-        self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
-        self.aggregate_count_period = aggregate_count_period
-        self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
-        self.aggregate_avg_period = aggregate_avg_period
-
-        self.v3io_framesd = v3io_framesd or config.v3io_framesd
-        self.v3io_api = v3io_api or config.v3io_api
-
-        self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
-        )
-
-        template = config.model_endpoint_monitoring.store_prefixes.default
-
-        kv_path = template.format(project=project, kind="endpoints")
-        _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)
-
-        tsdb_path = template.format(project=project, kind="events")
-        _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-
-        self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format(
-            project=project, kind="parquet"
-        )
-
-        logger.info(
-            "V3IO Configuration",
-            v3io_access_key=self.v3io_access_key,
-            model_monitoring_access_key=self.model_monitoring_access_key,
-            default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default,
-            user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space,
-            v3io_api=self.v3io_api,
-            v3io_framesd=self.v3io_framesd,
-            kv_container=self.kv_container,
-            kv_path=self.kv_path,
-            tsdb_container=self.tsdb_container,
-            tsdb_path=self.tsdb_path,
-            parquet_path=self.parquet_path,
-        )
-
-        self._kv_keys = [
-            FUNCTION_URI,
-            MODEL,
-            MODEL_CLASS,
-            TIMESTAMP,
-            ENDPOINT_ID,
-            LABELS,
-            UNPACKED_LABELS,
-            LATENCY_AVG_5M,
-            LATENCY_AVG_1H,
-            PREDICTIONS_PER_SECOND,
-            PREDICTIONS_COUNT_5M,
-            PREDICTIONS_COUNT_1H,
-            FIRST_REQUEST,
-            LAST_REQUEST,
-            ERROR_COUNT,
-        ]
-
-        self._flow = build_flow(
-            [
-                SyncEmitSource(),
-                ProcessEndpointEvent(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    v3io_access_key=self.v3io_access_key,
-                ),
-                FilterNotNone(),
-                FlatMap(lambda x: x),
-                MapFeatureNames(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    access_key=self.v3io_access_key,
-                ),
-                # Branch 1: Aggregate events, count averages and update TSDB and KV
-                [
-                    AggregateByKey(
-                        aggregates=[
-                            FieldAggregator(
-                                PREDICTIONS,
-                                ENDPOINT_ID,
-                                ["count"],
-                                SlidingWindows(
-                                    self.aggregate_count_windows,
-                                    self.aggregate_count_period,
-                                ),
-                            ),
-                            FieldAggregator(
-                                LATENCY,
-                                LATENCY,
-                                ["avg"],
-                                SlidingWindows(
-                                    self.aggregate_avg_windows,
-                                    self.aggregate_avg_period,
-                                ),
-                            ),
-                        ],
-                        table=Table("notable", NoopDriver()),
-                    ),
-                    SampleWindow(
-                        self.sample_window
-                    ),  # Add required gap between event to apply sampling
-                    Map(self.compute_predictions_per_second),
-                    # Branch 1.1: Updated KV
-                    [
-                        Map(self.process_before_kv),
-                        WriteToKV(container=self.kv_container, table=self.kv_path),
-                        InferSchema(
-                            v3io_access_key=self.v3io_access_key,
-                            v3io_framesd=self.v3io_framesd,
-                            container=self.kv_container,
-                            table=self.kv_path,
-                        ),
-                    ],
-                    # Branch 1.2: Update TSDB
-                    [
-                        # Map the event into taggable fields, add record type to each field
-                        Map(self.process_before_events_tsdb),
-                        [
-                            FilterKeys(BASE_METRICS),
-                            UnpackValues(BASE_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(ENDPOINT_FEATURES),
-                            UnpackValues(ENDPOINT_FEATURES),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(CUSTOM_METRICS),
-                            FilterNotNone(),
-                            UnpackValues(CUSTOM_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                    ],
-                ],
-                # Branch 2: Batch events, write to parquet
-                [
-                    Map(self.process_before_parquet),
-                    ParquetTarget(
-                        path=self.parquet_path,
-                        partition_cols=["$key", "$year", "$month", "$day", "$hour"],
-                        infer_columns_from_data=True,
-                        # Settings for _Batching
-                        max_events=self.parquet_batching_max_events,
-                        timeout_secs=self.parquet_batching_timeout_secs,
-                        # Settings for v3io storage
-                        storage_options={
-                            "v3io_api": self.v3io_api,
-                            "v3io_access_key": self.model_monitoring_access_key,
-                        },
-                    ),
-                ],
-            ]
-        ).run()
-
-    def consume(self, event: Dict):
-        events = []
-        if "headers" in event and "values" in event:
-            for values in event["values"]:
-                events.append({k: v for k, v in zip(event["headers"], values)})
-        else:
-            events.append(event)
-
-        for enriched in map(enrich_even_details, events):
-            if enriched is not None:
-                self._flow.emit(
-                    enriched,
-                    key=enriched[ENDPOINT_ID],
-                    event_time=datetime.strptime(enriched["when"], ISO_8061_UTC),
-                )
-            else:
-                pass
-
-    @staticmethod
-    def compute_predictions_per_second(event: dict):
-        event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600
-        return event
-
-    def process_before_kv(self, event: dict):
-        # Filter relevant keys
-        e = {k: event[k] for k in self._kv_keys}
-        # Unpack labels dictionary
-        e = {**e, **e.pop(UNPACKED_LABELS, {})}
-        # Write labels to kv as json string to be presentable later
-        e[LABELS] = json.dumps(e[LABELS])
-        return e
-
-    @staticmethod
-    def process_before_events_tsdb(event: Dict):
-        base_fields = [TIMESTAMP, ENDPOINT_ID]
-
-        base_event = {k: event[k] for k in base_fields}
-        base_event[TIMESTAMP] = pd.to_datetime(
-            base_event[TIMESTAMP], format=TIME_FORMAT
-        )
-
-        base_metrics = {
-            RECORD_TYPE: BASE_METRICS,
-            PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND],
-            PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M],
-            PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H],
-            LATENCY_AVG_5M: event[LATENCY_AVG_5M],
-            LATENCY_AVG_1H: event[LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        endpoint_features = {
-            RECORD_TYPE: ENDPOINT_FEATURES,
-            **event[NAMED_PREDICTIONS],
-            **event[NAMED_FEATURES],
-            **base_event,
-        }
-
-        processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features}
-
-        if event[METRICS]:
-            processed[CUSTOM_METRICS] = {
-                RECORD_TYPE: CUSTOM_METRICS,
-                **event[METRICS],
-                **base_event,
-            }
-
-        return processed
-
-    @staticmethod
-    def process_before_parquet(event: dict):
-        def set_none_if_empty(_event: dict, keys: List[str]):
-            for key in keys:
-                if not _event.get(key):
-                    _event[key] = None
-
-        def drop_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                _event.pop(key, None)
-
-        def unpack_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                value = _event.get(key)
-                if value is not None:
-                    _event = {**value, **event}
-
-        drop_if_exists(event, [UNPACKED_LABELS, FEATURES])
-        unpack_if_exists(event, [ENTITIES])
-        set_none_if_empty(event, [LABELS, METRICS, ENTITIES])
-        return event
-
-
-class ProcessEndpointEvent(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container: str = kv_container
-        self.kv_path: str = kv_path
-        self.v3io_access_key: str = v3io_access_key
-        self.first_request: Dict[str, str] = dict()
-        self.last_request: Dict[str, str] = dict()
-        self.error_count: Dict[str, int] = defaultdict(int)
-        self.endpoints: Set[str] = set()
-
-    def do(self, event: dict):
-        function_uri = event[FUNCTION_URI]
-        versioned_model = event[VERSIONED_MODEL]
-        endpoint_id = event[ENDPOINT_ID]
-
-        # In case this process fails, resume state from existing record
-        self.resume_state(endpoint_id)
-
-        # Handle errors coming from stream
-        found_errors = self.handle_errors(endpoint_id, event)
-        if found_errors:
-            return None
-
-        # Validate event fields
-        model_class = event.get("model_class") or event.get("class")
-        timestamp = event.get("when")
-        request_id = event.get("request", {}).get("id")
-        latency = event.get("microsec")
-        features = event.get("request", {}).get("inputs")
-        predictions = event.get("resp", {}).get("outputs")
-
-        if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],):
-            return None
-
-        if endpoint_id not in self.first_request:
-            self.first_request[endpoint_id] = timestamp
-        self.last_request[endpoint_id] = timestamp
-
-        if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],):
-            return None
-        if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, features, ["request", "inputs"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, predictions, ["resp", "outputs"],
-        ):
-            return None
-
-        unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()}
-
-        # Separate each model invocation into sub events
-        events = []
-        for i, (feature, prediction) in enumerate(zip(features, predictions)):
-            if not self.is_valid(
-                endpoint_id,
-                is_list_of_numerics,
-                feature,
-                ["request", "inputs", f"[{i}]"],
-            ):
-                return None
-
-            if not isinstance(prediction, list):
-                prediction = [prediction]
-
-            events.append(
-                {
-                    FUNCTION_URI: function_uri,
-                    MODEL: versioned_model,
-                    MODEL_CLASS: model_class,
-                    TIMESTAMP: timestamp,
-                    ENDPOINT_ID: endpoint_id,
-                    REQUEST_ID: request_id,
-                    LATENCY: latency,
-                    FEATURES: feature,
-                    PREDICTION: prediction,
-                    FIRST_REQUEST: self.first_request[endpoint_id],
-                    LAST_REQUEST: self.last_request[endpoint_id],
-                    ERROR_COUNT: self.error_count[endpoint_id],
-                    LABELS: event.get(LABELS, {}),
-                    METRICS: event.get(METRICS, {}),
-                    ENTITIES: event.get("request", {}).get(ENTITIES, {}),
-                    UNPACKED_LABELS: unpacked_labels,
-                }
-            )
-        return events
-
-    def resume_state(self, endpoint_id):
-        # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
-        # left them
-        if endpoint_id not in self.endpoints:
-            logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.v3io_access_key,
-            )
-            if endpoint_record:
-                first_request = endpoint_record.get(FIRST_REQUEST)
-                if first_request:
-                    self.first_request[endpoint_id] = first_request
-                error_count = endpoint_record.get(ERROR_COUNT)
-                if error_count:
-                    self.error_count[endpoint_id] = error_count
-            self.endpoints.add(endpoint_id)
-
-    def is_valid(
-        self, endpoint_id: str, validation_function, field: Any, dict_path: List[str]
-    ):
-        if validation_function(field, dict_path):
-            return True
-        self.error_count[endpoint_id] += 1
-        return False
-
-    def handle_errors(self, endpoint_id, event) -> bool:
-        if "error" in event:
-            self.error_count[endpoint_id] += 1
-            return True
-
-        return False
-
-
-def enrich_even_details(event) -> Optional[dict]:
-    function_uri = event.get(FUNCTION_URI)
-
-    if not is_not_none(function_uri, [FUNCTION_URI]):
-        return None
-
-    model = event.get(MODEL)
-    if not is_not_none(model, [MODEL]):
-        return None
-
-    version = event.get(VERSION)
-    versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-    endpoint_id = create_model_endpoint_id(
-        function_uri=function_uri, versioned_model=versioned_model,
-    )
-
-    endpoint_id = str(endpoint_id)
-
-    event[VERSIONED_MODEL] = versioned_model
-    event[ENDPOINT_ID] = endpoint_id
-
-    return event
-
-
-def is_not_none(field: Any, dict_path: List[str]):
-    if field is not None:
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-def is_list_of_numerics(
-    field: List[Union[int, float, dict, list]], dict_path: List[str]
-):
-    if all(isinstance(x, int) or isinstance(x, float) for x in field):
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-class FilterNotNone(Filter):
-    def __init__(self, **kwargs):
-        super().__init__(fn=lambda event: event is not None, **kwargs)
-
-
-class FilterKeys(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys = list(args)
-
-    def do(self, event):
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        return new_event if new_event else None
-
-
-class UnpackValues(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys_to_unpack = set(args)
-
-    def do(self, event):
-        unpacked = {}
-        for key in event.keys():
-            if key in self.keys_to_unpack:
-                unpacked = {**unpacked, **event[key]}
-            else:
-                unpacked[key] = event[key]
-        return unpacked
-
-
-class MapFeatureNames(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container = kv_container
-        self.kv_path = kv_path
-        self.access_key = access_key
-        self.feature_names = {}
-        self.label_columns = {}
-
-    def do(self, event: Dict):
-        endpoint_id = event[ENDPOINT_ID]
-
-        if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.access_key,
-            )
-            feature_names = endpoint_record.get(FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
-
-            label_columns = endpoint_record.get(LABEL_COLUMNS)
-            label_columns = json.loads(label_columns) if label_columns else None
-
-            if not feature_names:
-                logger.warn(
-                    f"Feature names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            if not label_columns:
-                logger.warn(
-                    f"label column names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            self.label_columns[endpoint_id] = label_columns
-            self.feature_names[endpoint_id] = feature_names
-
-            logger.info(
-                "Label columns", endpoint_id=endpoint_id, label_columns=label_columns
-            )
-            logger.info(
-                "Feature names", endpoint_id=endpoint_id, feature_names=feature_names
-            )
-
-        feature_names = self.feature_names[endpoint_id]
-        features = event[FEATURES]
-        event[NAMED_FEATURES] = {
-            name: feature for name, feature in zip(feature_names, features)
-        }
-
-        label_columns = self.label_columns[endpoint_id]
-        prediction = event[PREDICTION]
-        event[NAMED_PREDICTIONS] = {
-            name: prediction for name, prediction in zip(label_columns, prediction)
-        }
-        logger.info("Mapped event", event=event)
-        return event
-
-
-class WriteToKV(MapClass):
-    def __init__(self, container: str, table: str, **kwargs):
-        super().__init__(**kwargs)
-        self.container = container
-        self.table = table
-
-    def do(self, event: Dict):
-        get_v3io_client().kv.update(
-            container=self.container,
-            table_path=self.table,
-            key=event[ENDPOINT_ID],
-            attributes=event,
-        )
-        return event
-
-
-class InferSchema(MapClass):
-    def __init__(
-        self,
-        v3io_access_key: str,
-        v3io_framesd: str,
-        container: str,
-        table: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.container = container
-        self.v3io_access_key = v3io_access_key
-        self.v3io_framesd = v3io_framesd
-        self.table = table
-        self.keys = set()
-
-    def do(self, event: Dict):
-        key_set = set(event.keys())
-        if not key_set.issubset(self.keys):
-            self.keys.update(key_set)
-            get_frames_client(
-                token=self.v3io_access_key,
-                container=self.container,
-                address=self.v3io_framesd,
-            ).execute(backend="kv", table=self.table, command="infer_schema")
-            logger.info(
-                "Found new keys, inferred schema", table=self.table, event=event
-            )
-        return event
-
-
-def get_endpoint_record(
-    kv_container: str, kv_path: str, endpoint_id: str, access_key: str
-) -> Optional[dict]:
-    logger.info(
-        f"Grabbing endpoint data",
-        container=kv_container,
-        table_path=kv_path,
-        key=endpoint_id,
-    )
-    try:
-        endpoint_record = (
-            get_v3io_client()
-            .kv.get(
-                container=kv_container,
-                table_path=kv_path,
-                key=endpoint_id,
-                access_key=access_key,
-                raise_for_status=v3io.dataplane.RaiseForStatus.always,
-            )
-            .output.item
-        )
-        return endpoint_record
-    except Exception:
-        return None
-
-
-def init_context(context: MLClientCtx):
-    context.logger.info("Initializing EventStreamProcessor")
-    parameters = environ.get("MODEL_MONITORING_PARAMETERS")
-    parameters = json.loads(parameters) if parameters else {}
-    stream_processor = EventStreamProcessor(**parameters)
-    setattr(context, "stream_processor", stream_processor)
-
-
-def handler(context: MLClientCtx, event: Event):
-    event_body = json.loads(event.body)
-    context.logger.debug(event_body)
-    context.stream_processor.consume(event_body)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/src/function.yaml b/functions/development/model_monitoring_stream/latest/src/function.yaml deleted file mode 100644 index 07a21c40..00000000 --- a/functions/development/model_monitoring_stream/latest/src/function.yaml +++ /dev/null @@ -1,267 +0,0 @@ -kind: remote -metadata: - name: model-monitoring-stream - tag: '' - hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c - project: '' - categories: - - monitoring -spec: - command: '' - args: [] - image: livsmichael/mlrun-api:automation - entry_points: - consume: - name: consume - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 293 - compute_predictions_per_second: - name: compute_predictions_per_second - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 311 - process_before_kv: - name: process_before_kv - doc: '' - parameters: - - name: self - default: '' - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 316 - process_before_events_tsdb: - name: process_before_events_tsdb - doc: '' - parameters: - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 325 - process_before_parquet: - name: process_before_parquet - doc: '' - parameters: - - name: event - type: dict - default: '' - outputs: - - default: '' - lineno: 362 - set_none_if_empty: - name: set_none_if_empty - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 364 - drop_if_exists: - name: drop_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: List[str] - default: '' - outputs: - - default: '' - lineno: 369 - unpack_if_exists: - name: unpack_if_exists - doc: '' - parameters: - - name: _event - type: dict - default: '' - - name: keys - type: 
List[str] - default: '' - outputs: - - default: '' - lineno: 373 - do: - name: do - doc: '' - parameters: - - name: self - default: '' - - name: event - type: Dict - default: '' - outputs: - - default: '' - lineno: 702 - resume_state: - name: resume_state - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - outputs: - - default: '' - lineno: 475 - is_valid: - name: is_valid - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - type: str - default: '' - - name: validation_function - default: '' - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 495 - handle_errors: - name: handle_errors - doc: '' - parameters: - - name: self - default: '' - - name: endpoint_id - default: '' - - name: event - default: '' - outputs: - - default: '' - type: bool - lineno: 503 - enrich_even_details: - name: enrich_even_details - doc: '' - parameters: - - name: event - default: '' - outputs: - - default: '' - lineno: 511 - is_not_none: - name: is_not_none - doc: '' - parameters: - - name: field - type: Any - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 536 - is_list_of_numerics: - name: is_list_of_numerics - doc: '' - parameters: - - name: field - type: List[Union[int, float, dict, list]] - default: '' - - name: dict_path - type: List[str] - default: '' - outputs: - - default: '' - lineno: 545 - get_endpoint_record: - name: get_endpoint_record - doc: '' - parameters: - - name: kv_container - type: str - default: '' - - name: kv_path - type: str - default: '' - - name: endpoint_id - type: str - default: '' - - name: access_key - type: str - default: '' - outputs: - - default: '' - lineno: 717 - init_context: - name: init_context - doc: '' - parameters: - - name: context - type: MLClientCtx - default: '' - outputs: - - default: '' - lineno: 743 - handler: - name: handler - doc: '' - parameters: - 
- name: context - type: MLClientCtx - default: '' - - name: event - type: Event - default: '' - outputs: - - default: '' - lineno: 751 - description: '' - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: model-monitoring-stream - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - spec: - runtime: python:3.6 - handler: model_monitoring_stream:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode:  - source: '' - build: - commands: [] - code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py - default_handler: handler -verbose: false diff --git a/functions/development/model_monitoring_stream/latest/src/item.yaml b/functions/development/model_monitoring_stream/latest/src/item.yaml deleted file mode 100644 index 219fa528..00000000 --- a/functions/development/model_monitoring_stream/latest/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: -- monitoring -description: '' -doc: '' -example: model_monitoring_stream.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: {} -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: model-monitoring-stream -platformVersion: 3.5.0 -spec: - filename: model_monitoring_stream.py - handler: handler - image: livsmichael/mlrun-api:automation - kind: nuclio - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/model_monitoring_stream/latest/src/model_monitoring_stream.ipynb b/functions/development/model_monitoring_stream/latest/src/model_monitoring_stream.ipynb deleted file mode 100644 index 93d8c92e..00000000 --- a/functions/development/model_monitoring_stream/latest/src/model_monitoring_stream.ipynb +++ 
/dev/null @@ -1,178 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Model Monitoring\n", - "\n", - "## Initial set up (and pre-requisites)\n", - "1. Make sure you have the `mlrun-api` datasource available in your Grafana instance, otherwise add it by:\n", - " 1. Open your grafana instance\n", - " 2. Navigate to `Configuration -> Data Sources`\n", - " 3. Press `Add data source` and configure the following parameters\n", - " ```\n", - " Name: mlrun-api\n", - " URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints\n", - " Access: Server (default)\n", - "\n", - " ## Add a custom header of:\n", - " X-V3io-Session-Key: \n", - " ```\n", - " 4. Press `Save & Test` to make sure it works, a confirmation message should appear when this button is pressed\n", - "\n", - "2. Import the available dashboards `(./dashboards/*)` to you Grafana instance\n", - "3. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the\n", - " training step\n", - "\n", - " ```python\n", - " # Log model\n", - " context.log_model(\n", - " \"model\",\n", - " body=dumps(model),\n", - " artifact_path=context.artifact_subpath(\"models\"),\n", - " extra_data=eval_metrics,\n", - " model_file=\"model.pkl\",\n", - " metrics=context.results,\n", - " training_set=X_test, # <- make sure this is passed into log_model\n", - " labels={\"class\": \"sklearn.linear_model.LogisticRegression\"}\n", - " )\n", - " ```\n", - "4. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying\n", - " `fn.set_tracking()`\n", - "\n", - "## Configuration\n", - "The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The\n", - "available configurations can be found under `stream.Config`. 
Once configured it should be supplied as environment\n", - "parameters to the Nuclio function by setting `fn.set_envs`\n", - "\n", - "```python\n", - "project: str # project name\n", - "sample_window: int # The sampling window for the data that flows into the TSDB and the KV\n", - "kv_path_template: str # Path template for the kv table\n", - "tsdb_path_template: str # Path template for the tsdb table\n", - "parquet_path_template: str # v3io parquets path template, assumes v3io is mounted\n", - "tsdb_batching_max_events: int # The max amount of event to batch before writing the batch to tsdb\n", - "tsdb_batching_timeout_secs: int # The max amount of seconds a given batch can be gathered before being emitted\n", - "parquet_batching_max_events: int # The max amount of event to batch before writing the batch to parquet\n", - "parquet_batching_timeout_secs: int # The max amount of seconds, a given batch can be gathered before being written to parquet\n", - "container: str # container name\n", - "v3io_access_key: str # V3IO Access key\n", - "v3io_framesd: str # V3IO framesd URL\n", - "time_format: str # The time format into which time related fields will be converted\n", - "aggregate_count_windows: List[str] # List of window sizes for predictions count\n", - "aggregate_count_period: str # Period of predictions count windows\n", - "aggregate_avg_windows: List[str] # List of window sizes for average latency\n", - "aggregate_avg_period: str # Period of average latency windows\n", - "```" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Export function yaml" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "from mlrun.runtimes import RemoteRuntime\n", - "\n", - "\n", - "fn: RemoteRuntime = code_to_function(\n", - 
" name=\"model-monitoring-stream\",\n", - " kind=\"nuclio\",\n", - " image=\"mlrun/mlrun\",\n", - " filename=\"model_monitoring_stream.py\",\n", - " handler=\"handler\",\n", - ")\n", - "fn.export(\"model_monitoring_stream.yaml\")\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Deploy Stream Processing" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from mlrun import import_function\n", - "from mlrun.platforms import mount_v3io\n", - "from mlrun.runtimes import RemoteRuntime\n", - "import json\n", - "\n", - "# Set project name\n", - "project = \"\"\n", - "\n", - "fn: RemoteRuntime = import_function(\"hub://model_monitoring_stream\")\n", - "\n", - "fn.add_v3io_stream_trigger(\n", - " stream_path=f\"projects/{project}/model-endpoints/stream\",\n", - " name=\"monitoring_stream_trigger\",\n", - ")\n", - "\n", - "fn.set_env(\"MODEL_MONITORING_PARAMETERS\", json.dumps({\"project\": project, \"v3io_framesd\": os.environ.get(\"V3IO_FRAMESD\")}))\n", - "\n", - "fn.metadata.project = project\n", - "fn.apply(mount_v3io())\n", - "fn.deploy()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/src/model_monitoring_stream.py b/functions/development/model_monitoring_stream/latest/src/model_monitoring_stream.py deleted file mode 
100644 index 90c8b92c..00000000 --- a/functions/development/model_monitoring_stream/latest/src/model_monitoring_stream.py +++ /dev/null @@ -1,768 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import json -import os -from collections import defaultdict -from datetime import datetime -from os import environ -from typing import Dict, List, Set, Optional, Any, Union - -import pandas as pd -import v3io -from mlrun.config import config -from mlrun.run import MLClientCtx -from mlrun.utils import logger -from mlrun.utils.model_monitoring import ( - parse_model_endpoint_store_prefix, - create_model_endpoint_id, -) -from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client -from nuclio import Event -from storey import ( - FieldAggregator, - NoopDriver, - Table, - Map, - MapClass, - AggregateByKey, - build_flow, - Filter, - FlatMap, - TSDBTarget, - ParquetTarget, - SyncEmitSource, -) -from storey.dtypes import SlidingWindows -from storey.steps import SampleWindow -# Constants -from v3io.dataplane import RaiseForStatus - -ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z" -FUNCTION_URI = "function_uri" -MODEL = "model" -VERSION = "version" -VERSIONED_MODEL = "versioned_model" -MODEL_CLASS = "model_class" -TIMESTAMP = "timestamp" -ENDPOINT_ID = "endpoint_id" -REQUEST_ID = "request_id" -LABELS = "labels" -UNPACKED_LABELS = "unpacked_labels" -LATENCY_AVG_5M = "latency_avg_5m" -LATENCY_AVG_1H = "latency_avg_1h" -PREDICTIONS_PER_SECOND = 
"predictions_per_second" -PREDICTIONS_COUNT_5M = "predictions_count_5m" -PREDICTIONS_COUNT_1H = "predictions_count_1h" -FIRST_REQUEST = "first_request" -LAST_REQUEST = "last_request" -ERROR_COUNT = "error_count" -ENTITIES = "entities" -FEATURE_NAMES = "feature_names" -LABEL_COLUMNS = "label_columns" -LATENCY = "latency" -RECORD_TYPE = "record_type" -FEATURES = "features" -PREDICTION = "prediction" -PREDICTIONS = "predictions" -NAMED_FEATURES = "named_features" -NAMED_PREDICTIONS = "named_predictions" -BASE_METRICS = "base_metrics" -CUSTOM_METRICS = "custom_metrics" -ENDPOINT_FEATURES = "endpoint_features" -METRICS = "metrics" -BATCH_TIMESTAMP = "batch_timestamp" -TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f" # ISO 8061 - - -# Stream processing code -class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - 
self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count 
averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - 
container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - "v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - - def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass - - @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event - - def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e - - @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: 
BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = { - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed - - @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event - - -class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - - def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - 
found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - 
return events - - def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id) - - def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False - - def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False - - -def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event - - -def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: 
List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False - - -class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs) - - -class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - - def do(self, event): - new_event = {} - for key in self.keys: - if key in event: - new_event[key] = event[key] - - return new_event if new_event else None - - -class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - - def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked - - -class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - - def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - 
get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event - - -class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - - def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event - - -class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - 
self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - - def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event - - -def get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None - - -def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor) - - -def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body) diff --git a/functions/development/model_monitoring_stream/latest/src/requirements.txt b/functions/development/model_monitoring_stream/latest/src/requirements.txt deleted file mode 100644 index ef238930..00000000 --- a/functions/development/model_monitoring_stream/latest/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -storey -nuclio -v3io \ No newline at end of file diff --git 
a/functions/development/model_monitoring_stream/latest/static/documentation.html b/functions/development/model_monitoring_stream/latest/static/documentation.html deleted file mode 100644 index 875606b7..00000000 --- a/functions/development/model_monitoring_stream/latest/static/documentation.html +++ /dev/null @@ -1,342 +0,0 @@ - - - - - - - -model_monitoring_stream package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

model_monitoring_stream package

- -
- -
-
-
-
-
-

model_monitoring_stream package#

-
-

Submodules#

-
-
-

model_monitoring_stream.model_monitoring_stream module#

-
-
-class model_monitoring_stream.model_monitoring_stream.EventStreamProcessor(project: str, sample_window: int = 10, tsdb_batching_max_events: int = 10, tsdb_batching_timeout_secs: int = 300, parquet_batching_max_events: int = 10000, parquet_batching_timeout_secs: int = 3600, aggregate_count_windows: Optional[List[str]] = None, aggregate_count_period: str = '30s', aggregate_avg_windows: Optional[List[str]] = None, aggregate_avg_period: str = '30s', v3io_access_key: Optional[str] = None, v3io_framesd: Optional[str] = None, v3io_api: Optional[str] = None)[source]#
-

Bases: object

-
-
-static compute_predictions_per_second(event: dict)[source]#
-
-
-
-consume(event: Dict)[source]#
-
-
-
-static process_before_events_tsdb(event: Dict)[source]#
-
-
-
-process_before_kv(event: dict)[source]#
-
-
-
-static process_before_parquet(event: dict)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterKeys(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.FilterNotNone(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-
-class model_monitoring_stream.model_monitoring_stream.InferSchema(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: Dict)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.MapFeatureNames(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: Dict)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.ProcessEndpointEvent(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: dict)[source]#
-
-
-
-handle_errors(endpoint_id, event)bool[source]#
-
-
-
-is_valid(endpoint_id: str, validation_function, field: Any, dict_path: List[str])[source]#
-
-
-
-resume_state(endpoint_id)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.UnpackValues(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event)[source]#
-
-
-
-
-class model_monitoring_stream.model_monitoring_stream.WriteToKV(*args: Any, **kwargs: Any)[source]#
-

Bases: storey.

-
-
-do(event: Dict)[source]#
-
-
-
-
-model_monitoring_stream.model_monitoring_stream.enrich_even_details(event)Optional[dict][source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.get_endpoint_record(kv_container: str, kv_path: str, endpoint_id: str, access_key: str)Optional[dict][source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.handler(context: mlrun.execution.MLClientCtx, event: nuclio.request.Event)[source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.init_context(context: mlrun.execution.MLClientCtx)[source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_list_of_numerics(field: List[Union[int, float, dict, list]], dict_path: List[str])[source]#
-
-
-
-model_monitoring_stream.model_monitoring_stream.is_not_none(field: Any, dict_path: List[str])[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/static/example.html b/functions/development/model_monitoring_stream/latest/static/example.html deleted file mode 100644 index fe5c2685..00000000 --- a/functions/development/model_monitoring_stream/latest/static/example.html +++ /dev/null @@ -1,352 +0,0 @@ - - - - - - - -Model Monitoring - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Model Monitoring#

-
-

Initial set up (and pre-requisites)#

-
    -
  1. Make sure you have the mlrun-api datasource available in your Grafana instance, otherwise add it by:

    -
      -
    1. Open your grafana instance

    2. -
    3. Navigate to Configuration -> Data Sources

    4. -
    5. Press Add data source and configure the following parameters

    6. -
    -
    Name: mlrun-api
    -URL: http://mlrun-api:8080/api/grafana-proxy/model-endpoints
    -Access: Server (default)
    -
    -## Add a custom header of:
    -X-V3io-Session-Key: <YOUR ACCESS KEY>
    -
    -
    -
      -
    1. Press Save & Test to make sure it works, a confirmation message should appear when this button is pressed

    2. -
    -
  2. -
  3. Import the available dashboards (./dashboards/*) to you Grafana instance

  4. -
  5. To allow the system to utilize drift measurement, make sure you supply the train set when logging the model on the -training step

    -
    # Log model
    -context.log_model(
    -    "model",
    -    body=dumps(model),
    -    artifact_path=context.artifact_subpath("models"),
    -    extra_data=eval_metrics,
    -    model_file="model.pkl",
    -    metrics=context.results,
    -    training_set=X_test,  # <- make sure this is passed into log_model
    -    labels={"class": "sklearn.linear_model.LogisticRegression"}
    -)
    -
    -
    -
  6. -
  7. When serving a model, make sure that the Nuclio function is deployed with tracking enabled by applying -fn.set_tracking()

  8. -
-
-
-

Configuration#

-

The stream processing portion of the model monitoring, can be deployed under multiple configuration options. The -available configurations can be found under stream.Config. Once configured it should be supplied as environment -parameters to the Nuclio function by setting fn.set_envs

-
project: str                        # project name
-sample_window: int                  # The sampling window for the data that flows into the TSDB and the KV
-kv_path_template: str               # Path template for the kv table
-tsdb_path_template: str             # Path template for the tsdb table
-parquet_path_template: str          # v3io parquets path template, assumes v3io is mounted
-tsdb_batching_max_events: int       # The max amount of event to batch before writing the batch to tsdb
-tsdb_batching_timeout_secs: int     # The max amount of seconds a given batch can be gathered before being emitted
-parquet_batching_max_events: int    # The max amount of event to batch before writing the batch to parquet
-parquet_batching_timeout_secs: int  # The max amount of seconds, a given batch can be gathered before being written to parquet
-container: str                      # container name
-v3io_access_key: str                # V3IO Access key
-v3io_framesd: str                   # V3IO framesd URL
-time_format: str                    # The time format into which time related fields will be converted
-aggregate_count_windows: List[str]  # List of window sizes for predictions count
-aggregate_count_period: str         # Period of predictions count windows
-aggregate_avg_windows: List[str]    # List of window sizes for average latency
-aggregate_avg_period: str           # Period of average latency windows
-
-
-
-
-

Export function yaml#

-
-
-
from mlrun import code_to_function
-from mlrun.runtimes import RemoteRuntime
-
-
-fn: RemoteRuntime = code_to_function(
-    name="model-monitoring-stream",
-    kind="nuclio",
-    image="mlrun/mlrun",
-    filename="model_monitoring_stream.py",
-    handler="handler",
-)
-fn.export("model_monitoring_stream.yaml")
-
-
-
-
-
-
-

Deploy Stream Processing#

-
-
-
import os
-
-from mlrun import import_function
-from mlrun.platforms import mount_v3io
-from mlrun.runtimes import RemoteRuntime
-import json
-
-# Set project name
-project = ""
-
-fn: RemoteRuntime = import_function("hub://model_monitoring_stream")
-
-fn.add_v3io_stream_trigger(
-    stream_path=f"projects/{project}/model-endpoints/stream",
-    name="monitoring_stream_trigger",
-)
-
-fn.set_env("MODEL_MONITORING_PARAMETERS", json.dumps({"project": project, "v3io_framesd": os.environ.get("V3IO_FRAMESD")}))
-
-fn.metadata.project = project
-fn.apply(mount_v3io())
-fn.deploy()
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/static/function.html b/functions/development/model_monitoring_stream/latest/static/function.html deleted file mode 100644 index 81290204..00000000 --- a/functions/development/model_monitoring_stream/latest/static/function.html +++ /dev/null @@ -1,289 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: model-monitoring-stream
-  tag: ''
-  hash: 33f4d6de0858b3dfc9d150fc82fbed6feb05534c
-  project: ''
-  categories:
-  - monitoring
-spec:
-  command: ''
-  args: []
-  image: livsmichael/mlrun-api:automation
-  entry_points:
-    consume:
-      name: consume
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 293
-    compute_predictions_per_second:
-      name: compute_predictions_per_second
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 311
-    process_before_kv:
-      name: process_before_kv
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 316
-    process_before_events_tsdb:
-      name: process_before_events_tsdb
-      doc: ''
-      parameters:
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 325
-    process_before_parquet:
-      name: process_before_parquet
-      doc: ''
-      parameters:
-      - name: event
-        type: dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 362
-    set_none_if_empty:
-      name: set_none_if_empty
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 364
-    drop_if_exists:
-      name: drop_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 369
-    unpack_if_exists:
-      name: unpack_if_exists
-      doc: ''
-      parameters:
-      - name: _event
-        type: dict
-        default: ''
-      - name: keys
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 373
-    do:
-      name: do
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: event
-        type: Dict
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 702
-    resume_state:
-      name: resume_state
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 475
-    is_valid:
-      name: is_valid
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: validation_function
-        default: ''
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 495
-    handle_errors:
-      name: handle_errors
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: endpoint_id
-        default: ''
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-        type: bool
-      lineno: 503
-    enrich_even_details:
-      name: enrich_even_details
-      doc: ''
-      parameters:
-      - name: event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 511
-    is_not_none:
-      name: is_not_none
-      doc: ''
-      parameters:
-      - name: field
-        type: Any
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 536
-    is_list_of_numerics:
-      name: is_list_of_numerics
-      doc: ''
-      parameters:
-      - name: field
-        type: List[Union[int, float, dict, list]]
-        default: ''
-      - name: dict_path
-        type: List[str]
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 545
-    get_endpoint_record:
-      name: get_endpoint_record
-      doc: ''
-      parameters:
-      - name: kv_container
-        type: str
-        default: ''
-      - name: kv_path
-        type: str
-        default: ''
-      - name: endpoint_id
-        type: str
-        default: ''
-      - name: access_key
-        type: str
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 717
-    init_context:
-      name: init_context
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 743
-    handler:
-      name: handler
-      doc: ''
-      parameters:
-      - name: context
-        type: MLClientCtx
-        default: ''
-      - name: event
-        type: Event
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 751
-  description: ''
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: model-monitoring-stream
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-    spec:
-      runtime: python:3.6
-      handler: model_monitoring_stream:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: 
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/Michaelliv/functions.git#202b4c489e4c02c3025742ea237f1a042b7c6043:/home/michaell/projects/functions/model_monitoring_stream/model_monitoring_stream.py
-  default_handler: handler
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/static/item.html b/functions/development/model_monitoring_stream/latest/static/item.html deleted file mode 100644 index 454c1df8..00000000 --- a/functions/development/model_monitoring_stream/latest/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- monitoring
-description: ''
-doc: ''
-example: model_monitoring_stream.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels: {}
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: model-monitoring-stream
-platformVersion: 3.5.0
-spec:
-  filename: model_monitoring_stream.py
-  handler: handler
-  image: livsmichael/mlrun-api:automation
-  kind: nuclio
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/static/model_monitoring_stream.html b/functions/development/model_monitoring_stream/latest/static/model_monitoring_stream.html deleted file mode 100644 index 6f57cfc6..00000000 --- a/functions/development/model_monitoring_stream/latest/static/model_monitoring_stream.html +++ /dev/null @@ -1,908 +0,0 @@ - - - - - - - -model_monitoring_stream.model_monitoring_stream - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for model_monitoring_stream.model_monitoring_stream

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-
[docs]class EventStreamProcessor: - def __init__( - self, - project: str, - sample_window: int = 10, - tsdb_batching_max_events: int = 10, - tsdb_batching_timeout_secs: int = 60 * 5, # Default 5 minutes - parquet_batching_max_events: int = 10_000, - parquet_batching_timeout_secs: int = 60 * 60, # Default 1 hour - aggregate_count_windows: Optional[List[str]] = None, - aggregate_count_period: str = "30s", - aggregate_avg_windows: Optional[List[str]] = None, - aggregate_avg_period: str = "30s", - v3io_access_key: Optional[str] = None, - v3io_framesd: Optional[str] = None, - v3io_api: Optional[str] = None, - ): - self.project = project - self.sample_window = sample_window - self.tsdb_batching_max_events = tsdb_batching_max_events - self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs - self.parquet_batching_max_events = parquet_batching_max_events - self.parquet_batching_timeout_secs = parquet_batching_timeout_secs - self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"] - self.aggregate_count_period = aggregate_count_period - self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"] - self.aggregate_avg_period = aggregate_avg_period - - self.v3io_framesd = v3io_framesd or config.v3io_framesd - self.v3io_api = v3io_api or config.v3io_api - - self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY") - self.model_monitoring_access_key = ( - os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key - ) - - template = config.model_endpoint_monitoring.store_prefixes.default - - kv_path = template.format(project=project, kind="endpoints") - _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path) - - tsdb_path = template.format(project=project, kind="events") - _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix( - tsdb_path - ) - self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}" - - self.parquet_path = 
config.model_endpoint_monitoring.store_prefixes.user_space.format( - project=project, kind="parquet" - ) - - logger.info( - "V3IO Configuration", - v3io_access_key=self.v3io_access_key, - model_monitoring_access_key=self.model_monitoring_access_key, - default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default, - user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space, - v3io_api=self.v3io_api, - v3io_framesd=self.v3io_framesd, - kv_container=self.kv_container, - kv_path=self.kv_path, - tsdb_container=self.tsdb_container, - tsdb_path=self.tsdb_path, - parquet_path=self.parquet_path, - ) - - self._kv_keys = [ - FUNCTION_URI, - MODEL, - MODEL_CLASS, - TIMESTAMP, - ENDPOINT_ID, - LABELS, - UNPACKED_LABELS, - LATENCY_AVG_5M, - LATENCY_AVG_1H, - PREDICTIONS_PER_SECOND, - PREDICTIONS_COUNT_5M, - PREDICTIONS_COUNT_1H, - FIRST_REQUEST, - LAST_REQUEST, - ERROR_COUNT, - ] - - self._flow = build_flow( - [ - SyncEmitSource(), - ProcessEndpointEvent( - kv_container=self.kv_container, - kv_path=self.kv_path, - v3io_access_key=self.v3io_access_key, - ), - FilterNotNone(), - FlatMap(lambda x: x), - MapFeatureNames( - kv_container=self.kv_container, - kv_path=self.kv_path, - access_key=self.v3io_access_key, - ), - # Branch 1: Aggregate events, count averages and update TSDB and KV - [ - AggregateByKey( - aggregates=[ - FieldAggregator( - PREDICTIONS, - ENDPOINT_ID, - ["count"], - SlidingWindows( - self.aggregate_count_windows, - self.aggregate_count_period, - ), - ), - FieldAggregator( - LATENCY, - LATENCY, - ["avg"], - SlidingWindows( - self.aggregate_avg_windows, - self.aggregate_avg_period, - ), - ), - ], - table=Table("notable", NoopDriver()), - ), - SampleWindow( - self.sample_window - ), # Add required gap between event to apply sampling - Map(self.compute_predictions_per_second), - # Branch 1.1: Updated KV - [ - Map(self.process_before_kv), - WriteToKV(container=self.kv_container, table=self.kv_path), - InferSchema( - 
v3io_access_key=self.v3io_access_key, - v3io_framesd=self.v3io_framesd, - container=self.kv_container, - table=self.kv_path, - ), - ], - # Branch 1.2: Update TSDB - [ - # Map the event into taggable fields, add record type to each field - Map(self.process_before_events_tsdb), - [ - FilterKeys(BASE_METRICS), - UnpackValues(BASE_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(ENDPOINT_FEATURES), - UnpackValues(ENDPOINT_FEATURES), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - [ - FilterKeys(CUSTOM_METRICS), - FilterNotNone(), - UnpackValues(CUSTOM_METRICS), - TSDBTarget( - path=self.tsdb_path, - rate="10/m", - time_col=TIMESTAMP, - container=self.tsdb_container, - access_key=self.v3io_access_key, - v3io_frames=self.v3io_framesd, - index_cols=[ENDPOINT_ID, RECORD_TYPE], - # Settings for _Batching - max_events=self.tsdb_batching_max_events, - timeout_secs=self.tsdb_batching_timeout_secs, - key=ENDPOINT_ID, - ), - ], - ], - ], - # Branch 2: Batch events, write to parquet - [ - Map(self.process_before_parquet), - ParquetTarget( - path=self.parquet_path, - partition_cols=["$key", "$year", "$month", "$day", "$hour"], - infer_columns_from_data=True, - # Settings for _Batching - max_events=self.parquet_batching_max_events, - timeout_secs=self.parquet_batching_timeout_secs, - # Settings for v3io storage - storage_options={ - 
"v3io_api": self.v3io_api, - "v3io_access_key": self.model_monitoring_access_key, - }, - ), - ], - ] - ).run() - -
[docs] def consume(self, event: Dict): - events = [] - if "headers" in event and "values" in event: - for values in event["values"]: - events.append({k: v for k, v in zip(event["headers"], values)}) - else: - events.append(event) - - for enriched in map(enrich_even_details, events): - if enriched is not None: - self._flow.emit( - enriched, - key=enriched[ENDPOINT_ID], - event_time=datetime.strptime(enriched["when"], ISO_8061_UTC), - ) - else: - pass
- -
[docs] @staticmethod - def compute_predictions_per_second(event: dict): - event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600 - return event
- -
[docs] def process_before_kv(self, event: dict): - # Filter relevant keys - e = {k: event[k] for k in self._kv_keys} - # Unpack labels dictionary - e = {**e, **e.pop(UNPACKED_LABELS, {})} - # Write labels to kv as json string to be presentable later - e[LABELS] = json.dumps(e[LABELS]) - return e
- -
[docs] @staticmethod - def process_before_events_tsdb(event: Dict): - base_fields = [TIMESTAMP, ENDPOINT_ID] - - base_event = {k: event[k] for k in base_fields} - base_event[TIMESTAMP] = pd.to_datetime( - base_event[TIMESTAMP], format=TIME_FORMAT - ) - - base_metrics = { - RECORD_TYPE: BASE_METRICS, - PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND], - PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M], - PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H], - LATENCY_AVG_5M: event[LATENCY_AVG_5M], - LATENCY_AVG_1H: event[LATENCY_AVG_1H], - **base_event, - } - - endpoint_features = { - RECORD_TYPE: ENDPOINT_FEATURES, - **event[NAMED_PREDICTIONS], - **event[NAMED_FEATURES], - **base_event, - } - - processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features} - - if event[METRICS]: - processed[CUSTOM_METRICS] = { - RECORD_TYPE: CUSTOM_METRICS, - **event[METRICS], - **base_event, - } - - return processed
- -
[docs] @staticmethod - def process_before_parquet(event: dict): - def set_none_if_empty(_event: dict, keys: List[str]): - for key in keys: - if not _event.get(key): - _event[key] = None - - def drop_if_exists(_event: dict, keys: List[str]): - for key in keys: - _event.pop(key, None) - - def unpack_if_exists(_event: dict, keys: List[str]): - for key in keys: - value = _event.get(key) - if value is not None: - _event = {**value, **event} - - drop_if_exists(event, [UNPACKED_LABELS, FEATURES]) - unpack_if_exists(event, [ENTITIES]) - set_none_if_empty(event, [LABELS, METRICS, ENTITIES]) - return event
- - -
[docs]class ProcessEndpointEvent(MapClass): - def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container: str = kv_container - self.kv_path: str = kv_path - self.v3io_access_key: str = v3io_access_key - self.first_request: Dict[str, str] = dict() - self.last_request: Dict[str, str] = dict() - self.error_count: Dict[str, int] = defaultdict(int) - self.endpoints: Set[str] = set() - -
[docs] def do(self, event: dict): - function_uri = event[FUNCTION_URI] - versioned_model = event[VERSIONED_MODEL] - endpoint_id = event[ENDPOINT_ID] - - # In case this process fails, resume state from existing record - self.resume_state(endpoint_id) - - # Handle errors coming from stream - found_errors = self.handle_errors(endpoint_id, event) - if found_errors: - return None - - # Validate event fields - model_class = event.get("model_class") or event.get("class") - timestamp = event.get("when") - request_id = event.get("request", {}).get("id") - latency = event.get("microsec") - features = event.get("request", {}).get("inputs") - predictions = event.get("resp", {}).get("outputs") - - if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],): - return None - - if endpoint_id not in self.first_request: - self.first_request[endpoint_id] = timestamp - self.last_request[endpoint_id] = timestamp - - if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],): - return None - if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],): - return None - if not self.is_valid( - endpoint_id, is_not_none, features, ["request", "inputs"], - ): - return None - if not self.is_valid( - endpoint_id, is_not_none, predictions, ["resp", "outputs"], - ): - return None - - unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()} - - # Separate each model invocation into sub events - events = [] - for i, (feature, prediction) in enumerate(zip(features, predictions)): - if not self.is_valid( - endpoint_id, - is_list_of_numerics, - feature, - ["request", "inputs", f"[{i}]"], - ): - return None - - if not isinstance(prediction, list): - prediction = [prediction] - - events.append( - { - FUNCTION_URI: function_uri, - MODEL: versioned_model, - MODEL_CLASS: model_class, - TIMESTAMP: timestamp, - ENDPOINT_ID: endpoint_id, - REQUEST_ID: request_id, - LATENCY: latency, - FEATURES: feature, - PREDICTION: prediction, - FIRST_REQUEST: 
self.first_request[endpoint_id], - LAST_REQUEST: self.last_request[endpoint_id], - ERROR_COUNT: self.error_count[endpoint_id], - LABELS: event.get(LABELS, {}), - METRICS: event.get(METRICS, {}), - ENTITIES: event.get("request", {}).get(ENTITIES, {}), - UNPACKED_LABELS: unpacked_labels, - } - ) - return events
- -
[docs] def resume_state(self, endpoint_id): - # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we - # left them - if endpoint_id not in self.endpoints: - logger.info("Trying to resume state", endpoint_id=endpoint_id) - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.v3io_access_key, - ) - if endpoint_record: - first_request = endpoint_record.get(FIRST_REQUEST) - if first_request: - self.first_request[endpoint_id] = first_request - error_count = endpoint_record.get(ERROR_COUNT) - if error_count: - self.error_count[endpoint_id] = error_count - self.endpoints.add(endpoint_id)
- -
[docs] def is_valid( - self, endpoint_id: str, validation_function, field: Any, dict_path: List[str] - ): - if validation_function(field, dict_path): - return True - self.error_count[endpoint_id] += 1 - return False
- -
[docs] def handle_errors(self, endpoint_id, event) -> bool: - if "error" in event: - self.error_count[endpoint_id] += 1 - return True - - return False
- - -
[docs]def enrich_even_details(event) -> Optional[dict]: - function_uri = event.get(FUNCTION_URI) - - if not is_not_none(function_uri, [FUNCTION_URI]): - return None - - model = event.get(MODEL) - if not is_not_none(model, [MODEL]): - return None - - version = event.get(VERSION) - versioned_model = f"{model}:{version}" if version else f"{model}:latest" - - endpoint_id = create_model_endpoint_id( - function_uri=function_uri, versioned_model=versioned_model, - ) - - endpoint_id = str(endpoint_id) - - event[VERSIONED_MODEL] = versioned_model - event[ENDPOINT_ID] = endpoint_id - - return event
- - -
[docs]def is_not_none(field: Any, dict_path: List[str]): - if field is not None: - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False
- - -
[docs]def is_list_of_numerics( - field: List[Union[int, float, dict, list]], dict_path: List[str] -): - if all(isinstance(x, int) or isinstance(x, float) for x in field): - return True - logger.error( - f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]" - ) - return False
- - -
[docs]class FilterNotNone(Filter): - def __init__(self, **kwargs): - super().__init__(fn=lambda event: event is not None, **kwargs)
- - -
[docs]class FilterKeys(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys = list(args) - -
[docs] def do(self, event): - new_event = {} - for key in self.keys: - if key in event: - new_event[key] = event[key] - - return new_event if new_event else None
- - -
[docs]class UnpackValues(MapClass): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.keys_to_unpack = set(args) - -
[docs] def do(self, event): - unpacked = {} - for key in event.keys(): - if key in self.keys_to_unpack: - unpacked = {**unpacked, **event[key]} - else: - unpacked[key] = event[key] - return unpacked
- - -
[docs]class MapFeatureNames(MapClass): - def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs): - super().__init__(**kwargs) - self.kv_container = kv_container - self.kv_path = kv_path - self.access_key = access_key - self.feature_names = {} - self.label_columns = {} - -
[docs] def do(self, event: Dict): - endpoint_id = event[ENDPOINT_ID] - - if endpoint_id not in self.feature_names: - endpoint_record = get_endpoint_record( - kv_container=self.kv_container, - kv_path=self.kv_path, - endpoint_id=endpoint_id, - access_key=self.access_key, - ) - feature_names = endpoint_record.get(FEATURE_NAMES) - feature_names = json.loads(feature_names) if feature_names else None - - label_columns = endpoint_record.get(LABEL_COLUMNS) - label_columns = json.loads(label_columns) if label_columns else None - - if not feature_names: - logger.warn( - f"Feature names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={FEATURE_NAMES: json.dumps(feature_names)}, - raise_for_status=RaiseForStatus.always, - ) - - if not label_columns: - logger.warn( - f"label column names are not initialized, they will be automatically generated", - endpoint_id=endpoint_id, - ) - label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])] - get_v3io_client().kv.update( - container=self.kv_container, - table_path=self.kv_path, - access_key=self.access_key, - key=event[ENDPOINT_ID], - attributes={LABEL_COLUMNS: json.dumps(label_columns)}, - raise_for_status=RaiseForStatus.always, - ) - - self.label_columns[endpoint_id] = label_columns - self.feature_names[endpoint_id] = feature_names - - logger.info( - "Label columns", endpoint_id=endpoint_id, label_columns=label_columns - ) - logger.info( - "Feature names", endpoint_id=endpoint_id, feature_names=feature_names - ) - - feature_names = self.feature_names[endpoint_id] - features = event[FEATURES] - event[NAMED_FEATURES] = { - name: feature for name, feature in zip(feature_names, features) - } - - label_columns = self.label_columns[endpoint_id] - prediction = 
event[PREDICTION] - event[NAMED_PREDICTIONS] = { - name: prediction for name, prediction in zip(label_columns, prediction) - } - logger.info("Mapped event", event=event) - return event
- - -
[docs]class WriteToKV(MapClass): - def __init__(self, container: str, table: str, **kwargs): - super().__init__(**kwargs) - self.container = container - self.table = table - -
[docs] def do(self, event: Dict): - get_v3io_client().kv.update( - container=self.container, - table_path=self.table, - key=event[ENDPOINT_ID], - attributes=event, - ) - return event
- - -
[docs]class InferSchema(MapClass): - def __init__( - self, - v3io_access_key: str, - v3io_framesd: str, - container: str, - table: str, - **kwargs, - ): - super().__init__(**kwargs) - self.container = container - self.v3io_access_key = v3io_access_key - self.v3io_framesd = v3io_framesd - self.table = table - self.keys = set() - -
[docs] def do(self, event: Dict): - key_set = set(event.keys()) - if not key_set.issubset(self.keys): - self.keys.update(key_set) - get_frames_client( - token=self.v3io_access_key, - container=self.container, - address=self.v3io_framesd, - ).execute(backend="kv", table=self.table, command="infer_schema") - logger.info( - "Found new keys, inferred schema", table=self.table, event=event - ) - return event
- - -
[docs]def get_endpoint_record( - kv_container: str, kv_path: str, endpoint_id: str, access_key: str -) -> Optional[dict]: - logger.info( - f"Grabbing endpoint data", - container=kv_container, - table_path=kv_path, - key=endpoint_id, - ) - try: - endpoint_record = ( - get_v3io_client() - .kv.get( - container=kv_container, - table_path=kv_path, - key=endpoint_id, - access_key=access_key, - raise_for_status=v3io.dataplane.RaiseForStatus.always, - ) - .output.item - ) - return endpoint_record - except Exception: - return None
- - -
[docs]def init_context(context: MLClientCtx): - context.logger.info("Initializing EventStreamProcessor") - parameters = environ.get("MODEL_MONITORING_PARAMETERS") - parameters = json.loads(parameters) if parameters else {} - stream_processor = EventStreamProcessor(**parameters) - setattr(context, "stream_processor", stream_processor)
- - -
[docs]def handler(context: MLClientCtx, event: Event): - event_body = json.loads(event.body) - context.logger.debug(event_body) - context.stream_processor.consume(event_body)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/model_monitoring_stream/latest/static/source.html b/functions/development/model_monitoring_stream/latest/static/source.html deleted file mode 100644 index 01b89503..00000000 --- a/functions/development/model_monitoring_stream/latest/static/source.html +++ /dev/null @@ -1,790 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import os
-from collections import defaultdict
-from datetime import datetime
-from os import environ
-from typing import Dict, List, Set, Optional, Any, Union
-
-import pandas as pd
-import v3io
-from mlrun.config import config
-from mlrun.run import MLClientCtx
-from mlrun.utils import logger
-from mlrun.utils.model_monitoring import (
-    parse_model_endpoint_store_prefix,
-    create_model_endpoint_id,
-)
-from mlrun.utils.v3io_clients import get_v3io_client, get_frames_client
-from nuclio import Event
-from storey import (
-    FieldAggregator,
-    NoopDriver,
-    Table,
-    Map,
-    MapClass,
-    AggregateByKey,
-    build_flow,
-    Filter,
-    FlatMap,
-    TSDBTarget,
-    ParquetTarget,
-    SyncEmitSource,
-)
-from storey.dtypes import SlidingWindows
-from storey.steps import SampleWindow
-# Constants
-from v3io.dataplane import RaiseForStatus
-
-ISO_8061_UTC = "%Y-%m-%d %H:%M:%S.%f%z"
-FUNCTION_URI = "function_uri"
-MODEL = "model"
-VERSION = "version"
-VERSIONED_MODEL = "versioned_model"
-MODEL_CLASS = "model_class"
-TIMESTAMP = "timestamp"
-ENDPOINT_ID = "endpoint_id"
-REQUEST_ID = "request_id"
-LABELS = "labels"
-UNPACKED_LABELS = "unpacked_labels"
-LATENCY_AVG_5M = "latency_avg_5m"
-LATENCY_AVG_1H = "latency_avg_1h"
-PREDICTIONS_PER_SECOND = "predictions_per_second"
-PREDICTIONS_COUNT_5M = "predictions_count_5m"
-PREDICTIONS_COUNT_1H = "predictions_count_1h"
-FIRST_REQUEST = "first_request"
-LAST_REQUEST = "last_request"
-ERROR_COUNT = "error_count"
-ENTITIES = "entities"
-FEATURE_NAMES = "feature_names"
-LABEL_COLUMNS = "label_columns"
-LATENCY = "latency"
-RECORD_TYPE = "record_type"
-FEATURES = "features"
-PREDICTION = "prediction"
-PREDICTIONS = "predictions"
-NAMED_FEATURES = "named_features"
-NAMED_PREDICTIONS = "named_predictions"
-BASE_METRICS = "base_metrics"
-CUSTOM_METRICS = "custom_metrics"
-ENDPOINT_FEATURES = "endpoint_features"
-METRICS = "metrics"
-BATCH_TIMESTAMP = "batch_timestamp"
-TIME_FORMAT: str = "%Y-%m-%d %H:%M:%S.%f"  # ISO 8061
-
-
-# Stream processing code
-class EventStreamProcessor:
-    def __init__(
-        self,
-        project: str,
-        sample_window: int = 10,
-        tsdb_batching_max_events: int = 10,
-        tsdb_batching_timeout_secs: int = 60 * 5,  # Default 5 minutes
-        parquet_batching_max_events: int = 10_000,
-        parquet_batching_timeout_secs: int = 60 * 60,  # Default 1 hour
-        aggregate_count_windows: Optional[List[str]] = None,
-        aggregate_count_period: str = "30s",
-        aggregate_avg_windows: Optional[List[str]] = None,
-        aggregate_avg_period: str = "30s",
-        v3io_access_key: Optional[str] = None,
-        v3io_framesd: Optional[str] = None,
-        v3io_api: Optional[str] = None,
-    ):
-        self.project = project
-        self.sample_window = sample_window
-        self.tsdb_batching_max_events = tsdb_batching_max_events
-        self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
-        self.parquet_batching_max_events = parquet_batching_max_events
-        self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
-        self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
-        self.aggregate_count_period = aggregate_count_period
-        self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
-        self.aggregate_avg_period = aggregate_avg_period
-
-        self.v3io_framesd = v3io_framesd or config.v3io_framesd
-        self.v3io_api = v3io_api or config.v3io_api
-
-        self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
-        self.model_monitoring_access_key = (
-            os.environ.get("MODEL_MONITORING_ACCESS_KEY") or self.v3io_access_key
-        )
-
-        template = config.model_endpoint_monitoring.store_prefixes.default
-
-        kv_path = template.format(project=project, kind="endpoints")
-        _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)
-
-        tsdb_path = template.format(project=project, kind="events")
-        _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
-            tsdb_path
-        )
-        self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
-
-        self.parquet_path = config.model_endpoint_monitoring.store_prefixes.user_space.format(
-            project=project, kind="parquet"
-        )
-
-        logger.info(
-            "V3IO Configuration",
-            v3io_access_key=self.v3io_access_key,
-            model_monitoring_access_key=self.model_monitoring_access_key,
-            default_store_prefix=config.model_endpoint_monitoring.store_prefixes.default,
-            user_space_store_prefix=config.model_endpoint_monitoring.store_prefixes.user_space,
-            v3io_api=self.v3io_api,
-            v3io_framesd=self.v3io_framesd,
-            kv_container=self.kv_container,
-            kv_path=self.kv_path,
-            tsdb_container=self.tsdb_container,
-            tsdb_path=self.tsdb_path,
-            parquet_path=self.parquet_path,
-        )
-
-        self._kv_keys = [
-            FUNCTION_URI,
-            MODEL,
-            MODEL_CLASS,
-            TIMESTAMP,
-            ENDPOINT_ID,
-            LABELS,
-            UNPACKED_LABELS,
-            LATENCY_AVG_5M,
-            LATENCY_AVG_1H,
-            PREDICTIONS_PER_SECOND,
-            PREDICTIONS_COUNT_5M,
-            PREDICTIONS_COUNT_1H,
-            FIRST_REQUEST,
-            LAST_REQUEST,
-            ERROR_COUNT,
-        ]
-
-        self._flow = build_flow(
-            [
-                SyncEmitSource(),
-                ProcessEndpointEvent(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    v3io_access_key=self.v3io_access_key,
-                ),
-                FilterNotNone(),
-                FlatMap(lambda x: x),
-                MapFeatureNames(
-                    kv_container=self.kv_container,
-                    kv_path=self.kv_path,
-                    access_key=self.v3io_access_key,
-                ),
-                # Branch 1: Aggregate events, count averages and update TSDB and KV
-                [
-                    AggregateByKey(
-                        aggregates=[
-                            FieldAggregator(
-                                PREDICTIONS,
-                                ENDPOINT_ID,
-                                ["count"],
-                                SlidingWindows(
-                                    self.aggregate_count_windows,
-                                    self.aggregate_count_period,
-                                ),
-                            ),
-                            FieldAggregator(
-                                LATENCY,
-                                LATENCY,
-                                ["avg"],
-                                SlidingWindows(
-                                    self.aggregate_avg_windows,
-                                    self.aggregate_avg_period,
-                                ),
-                            ),
-                        ],
-                        table=Table("notable", NoopDriver()),
-                    ),
-                    SampleWindow(
-                        self.sample_window
-                    ),  # Add required gap between event to apply sampling
-                    Map(self.compute_predictions_per_second),
-                    # Branch 1.1: Updated KV
-                    [
-                        Map(self.process_before_kv),
-                        WriteToKV(container=self.kv_container, table=self.kv_path),
-                        InferSchema(
-                            v3io_access_key=self.v3io_access_key,
-                            v3io_framesd=self.v3io_framesd,
-                            container=self.kv_container,
-                            table=self.kv_path,
-                        ),
-                    ],
-                    # Branch 1.2: Update TSDB
-                    [
-                        # Map the event into taggable fields, add record type to each field
-                        Map(self.process_before_events_tsdb),
-                        [
-                            FilterKeys(BASE_METRICS),
-                            UnpackValues(BASE_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(ENDPOINT_FEATURES),
-                            UnpackValues(ENDPOINT_FEATURES),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                        [
-                            FilterKeys(CUSTOM_METRICS),
-                            FilterNotNone(),
-                            UnpackValues(CUSTOM_METRICS),
-                            TSDBTarget(
-                                path=self.tsdb_path,
-                                rate="10/m",
-                                time_col=TIMESTAMP,
-                                container=self.tsdb_container,
-                                access_key=self.v3io_access_key,
-                                v3io_frames=self.v3io_framesd,
-                                index_cols=[ENDPOINT_ID, RECORD_TYPE],
-                                # Settings for _Batching
-                                max_events=self.tsdb_batching_max_events,
-                                timeout_secs=self.tsdb_batching_timeout_secs,
-                                key=ENDPOINT_ID,
-                            ),
-                        ],
-                    ],
-                ],
-                # Branch 2: Batch events, write to parquet
-                [
-                    Map(self.process_before_parquet),
-                    ParquetTarget(
-                        path=self.parquet_path,
-                        partition_cols=["$key", "$year", "$month", "$day", "$hour"],
-                        infer_columns_from_data=True,
-                        # Settings for _Batching
-                        max_events=self.parquet_batching_max_events,
-                        timeout_secs=self.parquet_batching_timeout_secs,
-                        # Settings for v3io storage
-                        storage_options={
-                            "v3io_api": self.v3io_api,
-                            "v3io_access_key": self.model_monitoring_access_key,
-                        },
-                    ),
-                ],
-            ]
-        ).run()
-
-    def consume(self, event: Dict):
-        events = []
-        if "headers" in event and "values" in event:
-            for values in event["values"]:
-                events.append({k: v for k, v in zip(event["headers"], values)})
-        else:
-            events.append(event)
-
-        for enriched in map(enrich_even_details, events):
-            if enriched is not None:
-                self._flow.emit(
-                    enriched,
-                    key=enriched[ENDPOINT_ID],
-                    event_time=datetime.strptime(enriched["when"], ISO_8061_UTC),
-                )
-            else:
-                pass
-
-    @staticmethod
-    def compute_predictions_per_second(event: dict):
-        event[PREDICTIONS_PER_SECOND] = float(event[PREDICTIONS_COUNT_5M]) / 600
-        return event
-
-    def process_before_kv(self, event: dict):
-        # Filter relevant keys
-        e = {k: event[k] for k in self._kv_keys}
-        # Unpack labels dictionary
-        e = {**e, **e.pop(UNPACKED_LABELS, {})}
-        # Write labels to kv as json string to be presentable later
-        e[LABELS] = json.dumps(e[LABELS])
-        return e
-
-    @staticmethod
-    def process_before_events_tsdb(event: Dict):
-        base_fields = [TIMESTAMP, ENDPOINT_ID]
-
-        base_event = {k: event[k] for k in base_fields}
-        base_event[TIMESTAMP] = pd.to_datetime(
-            base_event[TIMESTAMP], format=TIME_FORMAT
-        )
-
-        base_metrics = {
-            RECORD_TYPE: BASE_METRICS,
-            PREDICTIONS_PER_SECOND: event[PREDICTIONS_PER_SECOND],
-            PREDICTIONS_COUNT_5M: event[PREDICTIONS_COUNT_5M],
-            PREDICTIONS_COUNT_1H: event[PREDICTIONS_COUNT_1H],
-            LATENCY_AVG_5M: event[LATENCY_AVG_5M],
-            LATENCY_AVG_1H: event[LATENCY_AVG_1H],
-            **base_event,
-        }
-
-        endpoint_features = {
-            RECORD_TYPE: ENDPOINT_FEATURES,
-            **event[NAMED_PREDICTIONS],
-            **event[NAMED_FEATURES],
-            **base_event,
-        }
-
-        processed = {BASE_METRICS: base_metrics, ENDPOINT_FEATURES: endpoint_features}
-
-        if event[METRICS]:
-            processed[CUSTOM_METRICS] = {
-                RECORD_TYPE: CUSTOM_METRICS,
-                **event[METRICS],
-                **base_event,
-            }
-
-        return processed
-
-    @staticmethod
-    def process_before_parquet(event: dict):
-        def set_none_if_empty(_event: dict, keys: List[str]):
-            for key in keys:
-                if not _event.get(key):
-                    _event[key] = None
-
-        def drop_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                _event.pop(key, None)
-
-        def unpack_if_exists(_event: dict, keys: List[str]):
-            for key in keys:
-                value = _event.get(key)
-                if value is not None:
-                    _event = {**value, **event}
-
-        drop_if_exists(event, [UNPACKED_LABELS, FEATURES])
-        unpack_if_exists(event, [ENTITIES])
-        set_none_if_empty(event, [LABELS, METRICS, ENTITIES])
-        return event
-
-
-class ProcessEndpointEvent(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, v3io_access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container: str = kv_container
-        self.kv_path: str = kv_path
-        self.v3io_access_key: str = v3io_access_key
-        self.first_request: Dict[str, str] = dict()
-        self.last_request: Dict[str, str] = dict()
-        self.error_count: Dict[str, int] = defaultdict(int)
-        self.endpoints: Set[str] = set()
-
-    def do(self, event: dict):
-        function_uri = event[FUNCTION_URI]
-        versioned_model = event[VERSIONED_MODEL]
-        endpoint_id = event[ENDPOINT_ID]
-
-        # In case this process fails, resume state from existing record
-        self.resume_state(endpoint_id)
-
-        # Handle errors coming from stream
-        found_errors = self.handle_errors(endpoint_id, event)
-        if found_errors:
-            return None
-
-        # Validate event fields
-        model_class = event.get("model_class") or event.get("class")
-        timestamp = event.get("when")
-        request_id = event.get("request", {}).get("id")
-        latency = event.get("microsec")
-        features = event.get("request", {}).get("inputs")
-        predictions = event.get("resp", {}).get("outputs")
-
-        if not self.is_valid(endpoint_id, is_not_none, timestamp, ["when"],):
-            return None
-
-        if endpoint_id not in self.first_request:
-            self.first_request[endpoint_id] = timestamp
-        self.last_request[endpoint_id] = timestamp
-
-        if not self.is_valid(endpoint_id, is_not_none, request_id, ["request", "id"],):
-            return None
-        if not self.is_valid(endpoint_id, is_not_none, latency, ["microsec"],):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, features, ["request", "inputs"],
-        ):
-            return None
-        if not self.is_valid(
-            endpoint_id, is_not_none, predictions, ["resp", "outputs"],
-        ):
-            return None
-
-        unpacked_labels = {f"_{k}": v for k, v in event.get(LABELS, {}).items()}
-
-        # Separate each model invocation into sub events
-        events = []
-        for i, (feature, prediction) in enumerate(zip(features, predictions)):
-            if not self.is_valid(
-                endpoint_id,
-                is_list_of_numerics,
-                feature,
-                ["request", "inputs", f"[{i}]"],
-            ):
-                return None
-
-            if not isinstance(prediction, list):
-                prediction = [prediction]
-
-            events.append(
-                {
-                    FUNCTION_URI: function_uri,
-                    MODEL: versioned_model,
-                    MODEL_CLASS: model_class,
-                    TIMESTAMP: timestamp,
-                    ENDPOINT_ID: endpoint_id,
-                    REQUEST_ID: request_id,
-                    LATENCY: latency,
-                    FEATURES: feature,
-                    PREDICTION: prediction,
-                    FIRST_REQUEST: self.first_request[endpoint_id],
-                    LAST_REQUEST: self.last_request[endpoint_id],
-                    ERROR_COUNT: self.error_count[endpoint_id],
-                    LABELS: event.get(LABELS, {}),
-                    METRICS: event.get(METRICS, {}),
-                    ENTITIES: event.get("request", {}).get(ENTITIES, {}),
-                    UNPACKED_LABELS: unpacked_labels,
-                }
-            )
-        return events
-
-    def resume_state(self, endpoint_id):
-        # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
-        # left them
-        if endpoint_id not in self.endpoints:
-            logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.v3io_access_key,
-            )
-            if endpoint_record:
-                first_request = endpoint_record.get(FIRST_REQUEST)
-                if first_request:
-                    self.first_request[endpoint_id] = first_request
-                error_count = endpoint_record.get(ERROR_COUNT)
-                if error_count:
-                    self.error_count[endpoint_id] = error_count
-            self.endpoints.add(endpoint_id)
-
-    def is_valid(
-        self, endpoint_id: str, validation_function, field: Any, dict_path: List[str]
-    ):
-        if validation_function(field, dict_path):
-            return True
-        self.error_count[endpoint_id] += 1
-        return False
-
-    def handle_errors(self, endpoint_id, event) -> bool:
-        if "error" in event:
-            self.error_count[endpoint_id] += 1
-            return True
-
-        return False
-
-
-def enrich_even_details(event) -> Optional[dict]:
-    function_uri = event.get(FUNCTION_URI)
-
-    if not is_not_none(function_uri, [FUNCTION_URI]):
-        return None
-
-    model = event.get(MODEL)
-    if not is_not_none(model, [MODEL]):
-        return None
-
-    version = event.get(VERSION)
-    versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-    endpoint_id = create_model_endpoint_id(
-        function_uri=function_uri, versioned_model=versioned_model,
-    )
-
-    endpoint_id = str(endpoint_id)
-
-    event[VERSIONED_MODEL] = versioned_model
-    event[ENDPOINT_ID] = endpoint_id
-
-    return event
-
-
-def is_not_none(field: Any, dict_path: List[str]):
-    if field is not None:
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-def is_list_of_numerics(
-    field: List[Union[int, float, dict, list]], dict_path: List[str]
-):
-    if all(isinstance(x, int) or isinstance(x, float) for x in field):
-        return True
-    logger.error(
-        f"Expected event field is missing: {field} [Event -> {''.join(dict_path)}]"
-    )
-    return False
-
-
-class FilterNotNone(Filter):
-    def __init__(self, **kwargs):
-        super().__init__(fn=lambda event: event is not None, **kwargs)
-
-
-class FilterKeys(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys = list(args)
-
-    def do(self, event):
-        new_event = {}
-        for key in self.keys:
-            if key in event:
-                new_event[key] = event[key]
-
-        return new_event if new_event else None
-
-
-class UnpackValues(MapClass):
-    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
-        self.keys_to_unpack = set(args)
-
-    def do(self, event):
-        unpacked = {}
-        for key in event.keys():
-            if key in self.keys_to_unpack:
-                unpacked = {**unpacked, **event[key]}
-            else:
-                unpacked[key] = event[key]
-        return unpacked
-
-
-class MapFeatureNames(MapClass):
-    def __init__(self, kv_container: str, kv_path: str, access_key: str, **kwargs):
-        super().__init__(**kwargs)
-        self.kv_container = kv_container
-        self.kv_path = kv_path
-        self.access_key = access_key
-        self.feature_names = {}
-        self.label_columns = {}
-
-    def do(self, event: Dict):
-        endpoint_id = event[ENDPOINT_ID]
-
-        if endpoint_id not in self.feature_names:
-            endpoint_record = get_endpoint_record(
-                kv_container=self.kv_container,
-                kv_path=self.kv_path,
-                endpoint_id=endpoint_id,
-                access_key=self.access_key,
-            )
-            feature_names = endpoint_record.get(FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
-
-            label_columns = endpoint_record.get(LABEL_COLUMNS)
-            label_columns = json.loads(label_columns) if label_columns else None
-
-            if not feature_names:
-                logger.warn(
-                    f"Feature names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            if not label_columns:
-                logger.warn(
-                    f"label column names are not initialized, they will be automatically generated",
-                    endpoint_id=endpoint_id,
-                )
-                label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
-                get_v3io_client().kv.update(
-                    container=self.kv_container,
-                    table_path=self.kv_path,
-                    access_key=self.access_key,
-                    key=event[ENDPOINT_ID],
-                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
-                    raise_for_status=RaiseForStatus.always,
-                )
-
-            self.label_columns[endpoint_id] = label_columns
-            self.feature_names[endpoint_id] = feature_names
-
-            logger.info(
-                "Label columns", endpoint_id=endpoint_id, label_columns=label_columns
-            )
-            logger.info(
-                "Feature names", endpoint_id=endpoint_id, feature_names=feature_names
-            )
-
-        feature_names = self.feature_names[endpoint_id]
-        features = event[FEATURES]
-        event[NAMED_FEATURES] = {
-            name: feature for name, feature in zip(feature_names, features)
-        }
-
-        label_columns = self.label_columns[endpoint_id]
-        prediction = event[PREDICTION]
-        event[NAMED_PREDICTIONS] = {
-            name: prediction for name, prediction in zip(label_columns, prediction)
-        }
-        logger.info("Mapped event", event=event)
-        return event
-
-
-class WriteToKV(MapClass):
-    def __init__(self, container: str, table: str, **kwargs):
-        super().__init__(**kwargs)
-        self.container = container
-        self.table = table
-
-    def do(self, event: Dict):
-        get_v3io_client().kv.update(
-            container=self.container,
-            table_path=self.table,
-            key=event[ENDPOINT_ID],
-            attributes=event,
-        )
-        return event
-
-
-class InferSchema(MapClass):
-    def __init__(
-        self,
-        v3io_access_key: str,
-        v3io_framesd: str,
-        container: str,
-        table: str,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.container = container
-        self.v3io_access_key = v3io_access_key
-        self.v3io_framesd = v3io_framesd
-        self.table = table
-        self.keys = set()
-
-    def do(self, event: Dict):
-        key_set = set(event.keys())
-        if not key_set.issubset(self.keys):
-            self.keys.update(key_set)
-            get_frames_client(
-                token=self.v3io_access_key,
-                container=self.container,
-                address=self.v3io_framesd,
-            ).execute(backend="kv", table=self.table, command="infer_schema")
-            logger.info(
-                "Found new keys, inferred schema", table=self.table, event=event
-            )
-        return event
-
-
-def get_endpoint_record(
-    kv_container: str, kv_path: str, endpoint_id: str, access_key: str
-) -> Optional[dict]:
-    logger.info(
-        f"Grabbing endpoint data",
-        container=kv_container,
-        table_path=kv_path,
-        key=endpoint_id,
-    )
-    try:
-        endpoint_record = (
-            get_v3io_client()
-            .kv.get(
-                container=kv_container,
-                table_path=kv_path,
-                key=endpoint_id,
-                access_key=access_key,
-                raise_for_status=v3io.dataplane.RaiseForStatus.always,
-            )
-            .output.item
-        )
-        return endpoint_record
-    except Exception:
-        return None
-
-
-def init_context(context: MLClientCtx):
-    context.logger.info("Initializing EventStreamProcessor")
-    parameters = environ.get("MODEL_MONITORING_PARAMETERS")
-    parameters = json.loads(parameters) if parameters else {}
-    stream_processor = EventStreamProcessor(**parameters)
-    setattr(context, "stream_processor", stream_processor)
-
-
-def handler(context: MLClientCtx, event: Event):
-    event_body = json.loads(event.body)
-    context.logger.debug(event_body)
-    context.stream_processor.consume(event_body)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.0.1/src/README.md b/functions/development/pandas_profiling_report/0.0.1/src/README.md deleted file mode 100644 index 40e0c9b2..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/src/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## pandas_profiling_report - -Creates an html report with various graphs/statistics/correlations for a given dataset. See sample report [here](https://pandas-profiling.github.io/pandas-profiling/examples/master/titanic/titanic_report.html). Link to GitHub page [here](https://github.com/pandas-profiling/pandas-profiling). - - -Usage example: - -```python -import mlrun, os -mlrun.mlconf.dbpath = 'http://mlrun-api:8080' - -# Load pandas_profiling_report function from Github -func = mlrun.import_function("hub://pandas_profiling_report").apply(mlrun.mount_v3io()) - -# Build MLRun image (only needs to be run once) -func.deploy() - -# Create task -data = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv' - -task = NewTask(name="pandas-profiling-report", - inputs={"data": DATA_URL}) - -# Run task on cluster -run = func.run(task, artifact_path='/User/artifacts') -``` diff --git a/functions/development/pandas_profiling_report/0.0.1/src/function.yaml b/functions/development/pandas_profiling_report/0.0.1/src/function.yaml deleted file mode 100644 index 4ad8c870..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: job -metadata: - name: pandas-profiling-report - tag: '' - hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b - project: default - labels: - author: nicks - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: pandas_profiling_report - entry_points: - pandas_profiling_report: - name: pandas_profiling_report - doc: Create a Pandas Profiling Report for a dataset. 
- parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: data - type: DataItem - doc: Dataset to create report for - default: '' - outputs: - - default: '' - lineno: 10 - description: Create Pandas Profiling Report from Dataset - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg== - commands: - - python -m pip install pandas_profiling - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py - affinity: null -verbose: false diff --git a/functions/development/pandas_profiling_report/0.0.1/src/item.yaml b/functions/development/pandas_profiling_report/0.0.1/src/item.yaml deleted file mode 100644 index 1335a252..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: Create Pandas Profiling Report from Dataset -doc: '' -example: pandas_profiling_report.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: nicks -maintainers: [] 
-marketplaceType: '' -mlrunVersion: '' -name: pandas-profiling-report -platformVersion: '' -spec: - filename: pandas_profiling_report.py - handler: pandas_profiling_report - image: mlrun/mlrun - kind: job - requirements: - - pandas_profiling -url: '' -version: 0.0.1 diff --git a/functions/development/pandas_profiling_report/0.0.1/src/pandas_profiling_report.ipynb b/functions/development/pandas_profiling_report/0.0.1/src/pandas_profiling_report.ipynb deleted file mode 100644 index 61aeba26..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/src/pandas_profiling_report.ipynb +++ /dev/null @@ -1,794 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pandas Profiling Report" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install pandas_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pandas_profiling\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def pandas_profiling_report(\n", - " context: MLClientCtx,\n", - " data: DataItem,\n", - ") -> None:\n", - " \"\"\"Create a Pandas Profiling Report for a dataset.\n", - " :param context: the function context\n", 
- " :param data: Dataset to create report for\n", - " \"\"\"\n", - " \n", - " # Load dataset\n", - " df = data.as_df()\n", - " \n", - " # Create Pandas Profiling Report\n", - " profile = df.profile_report(title='Pandas Profiling Report')\n", - " \n", - " # Save to MLRun DB\n", - " context.log_artifact('Pandas Profiling Report',\n", - " body=profile.to_html(),\n", - " local_path='pandas_profiling_report.html')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"pandas_profiling_report\", kind=\"job\")\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"pandas_profiling_report\"\n", - "fn.spec.description = \"Create Pandas Profiling Report from Dataset\"\n", - "fn.metadata.categories = [\"analysis\"]\n", - "fn.metadata.labels = {\"author\": \"nicks\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
tests" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name=\"pandas-profiling-report\", \n", - " handler=pandas_profiling_report, \n", - " inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run locally" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n", - "> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e -> http://mlrun-api:8080\n", - "> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "86c3397cc7384565815af90bc5a6d10b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7dece2ab7184c909611cf0aed3ef474", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, 
description='Generate report structure', max=1.0, style=ProgressStyle(…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7153ac93afcd4e77a4b5a312766af995", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default\n", - "> 2020-10-15 19:21:52,944 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "run = run_local(task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remotely" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Create MLRun image (only needs to be run once)\n", - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22 -> http://mlrun-api:8080\n", - "> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m\n", - "Summarize dataset: 100%|██████████| 19/19 [00:05<00:00, 3.78it/s, Completed] \n", - "Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.22s/it]\n", - "> 2020-10-15 19:23:33,779 [info] run executed, status=completed\n", - "Render HTML: 100%|██████████| 1/1 [00:00<00:00, 2.07it/s]\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default\n", - "> 2020-10-15 19:23:36,481 [info] run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(task, inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/pandas_profiling_report/0.0.1/src/pandas_profiling_report.py b/functions/development/pandas_profiling_report/0.0.1/src/pandas_profiling_report.py deleted file mode 100644 index c03077df..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/src/pandas_profiling_report.py +++ /dev/null @@ -1,27 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pandas_profiling - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem - - -def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. 
- :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - ) diff --git a/functions/development/pandas_profiling_report/0.0.1/static/documentation.html b/functions/development/pandas_profiling_report/0.0.1/static/documentation.html deleted file mode 100644 index eb625786..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/static/documentation.html +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - -pandas_profiling_report package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

pandas_profiling_report package

-
-

Submodules

-
-
-

pandas_profiling_report.pandas_profiling_report module

-
-
-pandas_profiling_report.pandas_profiling_report.pandas_profiling_report(context: mlrun.execution.MLClientCtx, data: mlrun.datastore.base.DataItem)None[source]
-

Create a Pandas Profiling Report for a dataset. -:param context: the function context -:param data: Dataset to create report for

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.0.1/static/example.html b/functions/development/pandas_profiling_report/0.0.1/static/example.html deleted file mode 100644 index 47cdc635..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/static/example.html +++ /dev/null @@ -1,733 +0,0 @@ - - - - - - - -Pandas Profiling Report - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Pandas Profiling Report

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install pandas_profiling
-
-
-
-
-
-
-
import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
-
-
-
-
def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-    
-    # Load dataset
-    df = data.as_df()
-    
-    # Create Pandas Profiling Report
-    profile = df.profile_report(title='Pandas Profiling Report')
-    
-    # Save to MLRun DB
-    context.log_artifact('Pandas Profiling Report',
-                         body=profile.to_html(),
-                         local_path='pandas_profiling_report.html')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("pandas_profiling_report", kind="job")
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "pandas_profiling_report"
-fn.spec.description = "Create Pandas Profiling Report from Dataset"
-fn.metadata.categories = ["analysis"]
-fn.metadata.labels = {"author": "nicks"}
-fn.export("function.yaml")
-
-
-
-
-
> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'
-
-
-
-
-
-
-
task = NewTask(name="pandas-profiling-report", 
-               handler=pandas_profiling_report, 
-               inputs={"data": DATA_URL})
-
-
-
-
-
-

run locally

-
-
-
run = run_local(task)
-
-
-
-
-
> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e  -> http://mlrun-api:8080
-> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-
-
-

-
-
-

-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default
-> 2020-10-15 19:21:52,944 [info] run executed, status=completed
-
-
-
-
-
-
-

run remotely

-
-
-
# Create MLRun image (only needs to be run once)
-fn.deploy()
-
-
-
-
-
-
-
fn.run(task, inputs={"data": DATA_URL})
-
-
-
-
-
> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22  -> http://mlrun-api:8080
-> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m
-Summarize dataset: 100%|██████████| 19/19 [00:05<00:00,  3.78it/s, Completed]                         
-Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
-> 2020-10-15 19:23:33,779 [info] run executed, status=completed
-Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default
-> 2020-10-15 19:23:36,481 [info] run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7fe2297b51d0>
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.0.1/static/function.html b/functions/development/pandas_profiling_report/0.0.1/static/function.html deleted file mode 100644 index 2b5b5972..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: pandas-profiling-report
-  tag: ''
-  hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b
-  project: default
-  labels:
-    author: nicks
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: pandas_profiling_report
-  entry_points:
-    pandas_profiling_report:
-      name: pandas_profiling_report
-      doc: Create a Pandas Profiling Report for a dataset.
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: data
-        type: DataItem
-        doc: Dataset to create report for
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-  description: Create Pandas Profiling Report from Dataset
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg==
-    commands:
-    - python -m pip install pandas_profiling
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.0.1/static/item.html b/functions/development/pandas_profiling_report/0.0.1/static/item.html deleted file mode 100644 index 3821f67c..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: Create Pandas Profiling Report from Dataset
-doc: ''
-example: pandas_profiling_report.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: nicks
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: pandas-profiling-report
-platformVersion: ''
-spec:
-  filename: pandas_profiling_report.py
-  handler: pandas_profiling_report
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - pandas_profiling
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.0.1/static/source.html b/functions/development/pandas_profiling_report/0.0.1/static/source.html deleted file mode 100644 index 439b80b2..00000000 --- a/functions/development/pandas_profiling_report/0.0.1/static/source.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-
-    df = data.as_df()
-
-    profile = df.profile_report(title="Pandas Profiling Report")
-
-    context.log_artifact(
-        "Pandas Profiling Report",
-        body=profile.to_html(),
-        local_path="pandas_profiling_report.html",
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.8.0/src/README.md b/functions/development/pandas_profiling_report/0.8.0/src/README.md deleted file mode 100644 index 40e0c9b2..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/src/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## pandas_profiling_report - -Creates an html report with various graphs/statistics/correlations for a given dataset. See sample report [here](https://pandas-profiling.github.io/pandas-profiling/examples/master/titanic/titanic_report.html). Link to GitHub page [here](https://github.com/pandas-profiling/pandas-profiling). - - -Usage example: - -```python -import mlrun, os -mlrun.mlconf.dbpath = 'http://mlrun-api:8080' - -# Load pandas_profiling_report function from Github -func = mlrun.import_function("hub://pandas_profiling_report").apply(mlrun.mount_v3io()) - -# Build MLRun image (only needs to be run once) -func.deploy() - -# Create task -data = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv' - -task = NewTask(name="pandas-profiling-report", - inputs={"data": DATA_URL}) - -# Run task on cluster -run = func.run(task, artifact_path='/User/artifacts') -``` diff --git a/functions/development/pandas_profiling_report/0.8.0/src/function.yaml b/functions/development/pandas_profiling_report/0.8.0/src/function.yaml deleted file mode 100644 index 4ad8c870..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: job -metadata: - name: pandas-profiling-report - tag: '' - hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b - project: default - labels: - author: nicks - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: pandas_profiling_report - entry_points: - pandas_profiling_report: - name: pandas_profiling_report - doc: Create a Pandas Profiling Report for a dataset. 
- parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: data - type: DataItem - doc: Dataset to create report for - default: '' - outputs: - - default: '' - lineno: 10 - description: Create Pandas Profiling Report from Dataset - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg== - commands: - - python -m pip install pandas_profiling - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py - affinity: null -verbose: false diff --git a/functions/development/pandas_profiling_report/0.8.0/src/item.yaml b/functions/development/pandas_profiling_report/0.8.0/src/item.yaml deleted file mode 100644 index 6ffd2470..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: Create Pandas Profiling Report from Dataset -doc: '' -example: pandas_profiling_report.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: nicks -maintainers: [] 
-marketplaceType: '' -mlrunVersion: 0.8.0 -name: pandas-profiling-report -platformVersion: 3.2.0 -spec: - filename: pandas_profiling_report.py - handler: pandas_profiling_report - image: mlrun/mlrun - kind: job - requirements: - - pandas_profiling -url: '' -version: 0.8.0 diff --git a/functions/development/pandas_profiling_report/0.8.0/src/pandas_profiling_report.ipynb b/functions/development/pandas_profiling_report/0.8.0/src/pandas_profiling_report.ipynb deleted file mode 100644 index 61aeba26..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/src/pandas_profiling_report.ipynb +++ /dev/null @@ -1,794 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pandas Profiling Report" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install pandas_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pandas_profiling\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def pandas_profiling_report(\n", - " context: MLClientCtx,\n", - " data: DataItem,\n", - ") -> None:\n", - " \"\"\"Create a Pandas Profiling Report for a dataset.\n", - " :param context: the function 
context\n", - " :param data: Dataset to create report for\n", - " \"\"\"\n", - " \n", - " # Load dataset\n", - " df = data.as_df()\n", - " \n", - " # Create Pandas Profiling Report\n", - " profile = df.profile_report(title='Pandas Profiling Report')\n", - " \n", - " # Save to MLRun DB\n", - " context.log_artifact('Pandas Profiling Report',\n", - " body=profile.to_html(),\n", - " local_path='pandas_profiling_report.html')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"pandas_profiling_report\", kind=\"job\")\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"pandas_profiling_report\"\n", - "fn.spec.description = \"Create Pandas Profiling Report from Dataset\"\n", - "fn.metadata.categories = [\"analysis\"]\n", - "fn.metadata.labels = {\"author\": \"nicks\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
[ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name=\"pandas-profiling-report\", \n", - " handler=pandas_profiling_report, \n", - " inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run locally" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n", - "> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e -> http://mlrun-api:8080\n", - "> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "86c3397cc7384565815af90bc5a6d10b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7dece2ab7184c909611cf0aed3ef474", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7153ac93afcd4e77a4b5a312766af995", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default\n", - "> 2020-10-15 19:21:52,944 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "run = run_local(task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remotely" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Create MLRun image (only needs to be run once)\n", - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22 -> http://mlrun-api:8080\n", - "> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m\n", - "Summarize dataset: 100%|██████████| 19/19 [00:05<00:00, 3.78it/s, Completed] \n", - "Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.22s/it]\n", - "> 2020-10-15 19:23:33,779 [info] run executed, status=completed\n", - "Render HTML: 100%|██████████| 1/1 [00:00<00:00, 2.07it/s]\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default\n", - "> 2020-10-15 19:23:36,481 [info] run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(task, inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/pandas_profiling_report/0.8.0/src/pandas_profiling_report.py b/functions/development/pandas_profiling_report/0.8.0/src/pandas_profiling_report.py deleted file mode 100644 index c03077df..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/src/pandas_profiling_report.py +++ /dev/null @@ -1,27 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pandas_profiling - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem - - -def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. 
- :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - ) diff --git a/functions/development/pandas_profiling_report/0.8.0/static/documentation.html b/functions/development/pandas_profiling_report/0.8.0/static/documentation.html deleted file mode 100644 index eb625786..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/static/documentation.html +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - -pandas_profiling_report package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

pandas_profiling_report package

-
-

Submodules

-
-
-

pandas_profiling_report.pandas_profiling_report module

-
-
-pandas_profiling_report.pandas_profiling_report.pandas_profiling_report(context: mlrun.execution.MLClientCtx, data: mlrun.datastore.base.DataItem)None[source]
-

Create a Pandas Profiling Report for a dataset. -:param context: the function context -:param data: Dataset to create report for

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.8.0/static/example.html b/functions/development/pandas_profiling_report/0.8.0/static/example.html deleted file mode 100644 index 47cdc635..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/static/example.html +++ /dev/null @@ -1,733 +0,0 @@ - - - - - - - -Pandas Profiling Report - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Pandas Profiling Report

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install pandas_profiling
-
-
-
-
-
-
-
import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
-
-
-
-
def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-    
-    # Load dataset
-    df = data.as_df()
-    
-    # Create Pandas Profiling Report
-    profile = df.profile_report(title='Pandas Profiling Report')
-    
-    # Save to MLRun DB
-    context.log_artifact('Pandas Profiling Report',
-                         body=profile.to_html(),
-                         local_path='pandas_profiling_report.html')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("pandas_profiling_report", kind="job")
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "pandas_profiling_report"
-fn.spec.description = "Create Pandas Profiling Report from Dataset"
-fn.metadata.categories = ["analysis"]
-fn.metadata.labels = {"author": "nicks"}
-fn.export("function.yaml")
-
-
-
-
-
> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'
-
-
-
-
-
-
-
task = NewTask(name="pandas-profiling-report", 
-               handler=pandas_profiling_report, 
-               inputs={"data": DATA_URL})
-
-
-
-
-
-

run locally

-
-
-
run = run_local(task)
-
-
-
-
-
> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e  -> http://mlrun-api:8080
-> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-
-
-

-
-
-

-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default
-> 2020-10-15 19:21:52,944 [info] run executed, status=completed
-
-
-
-
-
-
-

run remotely

-
-
-
# Create MLRun image (only needs to be run once)
-fn.deploy()
-
-
-
-
-
-
-
fn.run(task, inputs={"data": DATA_URL})
-
-
-
-
-
> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22  -> http://mlrun-api:8080
-> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m
-Summarize dataset: 100%|██████████| 19/19 [00:05<00:00,  3.78it/s, Completed]                         
-Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
-> 2020-10-15 19:23:33,779 [info] run executed, status=completed
-Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default
-> 2020-10-15 19:23:36,481 [info] run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7fe2297b51d0>
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.8.0/static/function.html b/functions/development/pandas_profiling_report/0.8.0/static/function.html deleted file mode 100644 index 2b5b5972..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: pandas-profiling-report
-  tag: ''
-  hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b
-  project: default
-  labels:
-    author: nicks
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: pandas_profiling_report
-  entry_points:
-    pandas_profiling_report:
-      name: pandas_profiling_report
-      doc: Create a Pandas Profiling Report for a dataset.
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: data
-        type: DataItem
-        doc: Dataset to create report for
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-  description: Create Pandas Profiling Report from Dataset
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg==
-    commands:
-    - python -m pip install pandas_profiling
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.8.0/static/item.html b/functions/development/pandas_profiling_report/0.8.0/static/item.html deleted file mode 100644 index beef8723..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: Create Pandas Profiling Report from Dataset
-doc: ''
-example: pandas_profiling_report.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: nicks
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: pandas-profiling-report
-platformVersion: 3.2.0
-spec:
-  filename: pandas_profiling_report.py
-  handler: pandas_profiling_report
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - pandas_profiling
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.8.0/static/source.html b/functions/development/pandas_profiling_report/0.8.0/static/source.html deleted file mode 100644 index 439b80b2..00000000 --- a/functions/development/pandas_profiling_report/0.8.0/static/source.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-
-    df = data.as_df()
-
-    profile = df.profile_report(title="Pandas Profiling Report")
-
-    context.log_artifact(
-        "Pandas Profiling Report",
-        body=profile.to_html(),
-        local_path="pandas_profiling_report.html",
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.9.0/src/README.md b/functions/development/pandas_profiling_report/0.9.0/src/README.md deleted file mode 100644 index 40e0c9b2..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/src/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## pandas_profiling_report - -Creates an html report with various graphs/statistics/correlations for a given dataset. See sample report [here](https://pandas-profiling.github.io/pandas-profiling/examples/master/titanic/titanic_report.html). Link to GitHub page [here](https://github.com/pandas-profiling/pandas-profiling). - - -Usage example: - -```python -import mlrun, os -mlrun.mlconf.dbpath = 'http://mlrun-api:8080' - -# Load pandas_profiling_report function from Github -func = mlrun.import_function("hub://pandas_profiling_report").apply(mlrun.mount_v3io()) - -# Build MLRun image (only needs to be run once) -func.deploy() - -# Create task -data = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv' - -task = NewTask(name="pandas-profiling-report", - inputs={"data": DATA_URL}) - -# Run task on cluster -run = func.run(task, artifact_path='/User/artifacts') -``` diff --git a/functions/development/pandas_profiling_report/0.9.0/src/function.yaml b/functions/development/pandas_profiling_report/0.9.0/src/function.yaml deleted file mode 100644 index ffdbbf83..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: job -metadata: - name: pandas-profiling-report - tag: '' - hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b - project: '' - labels: - author: nicks - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: pandas_profiling_report - entry_points: - pandas_profiling_report: - name: pandas_profiling_report - doc: Create a Pandas Profiling Report for a dataset. 
- parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: data - type: DataItem - doc: Dataset to create report for - default: '' - outputs: - - default: '' - lineno: 10 - description: Create Pandas Profiling Report from Dataset - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg== - commands: - - python -m pip install pandas_profiling - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py - affinity: null -verbose: false diff --git a/functions/development/pandas_profiling_report/0.9.0/src/item.yaml b/functions/development/pandas_profiling_report/0.9.0/src/item.yaml deleted file mode 100644 index d9f83489..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: Create Pandas Profiling Report from Dataset -doc: '' -example: pandas_profiling_report.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: nicks -maintainers: [] 
-marketplaceType: '' -mlrunVersion: 0.8.0 -name: pandas-profiling-report -platformVersion: 3.2.0 -spec: - filename: pandas_profiling_report.py - handler: pandas_profiling_report - image: mlrun/mlrun - kind: job - requirements: - - pandas_profiling -url: '' -version: 0.9.0 diff --git a/functions/development/pandas_profiling_report/0.9.0/src/pandas_profiling_report.ipynb b/functions/development/pandas_profiling_report/0.9.0/src/pandas_profiling_report.ipynb deleted file mode 100644 index 61aeba26..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/src/pandas_profiling_report.ipynb +++ /dev/null @@ -1,794 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pandas Profiling Report" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install pandas_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pandas_profiling\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def pandas_profiling_report(\n", - " context: MLClientCtx,\n", - " data: DataItem,\n", - ") -> None:\n", - " \"\"\"Create a Pandas Profiling Report for a dataset.\n", - " :param context: the function 
context\n", - " :param data: Dataset to create report for\n", - " \"\"\"\n", - " \n", - " # Load dataset\n", - " df = data.as_df()\n", - " \n", - " # Create Pandas Profiling Report\n", - " profile = df.profile_report(title='Pandas Profiling Report')\n", - " \n", - " # Save to MLRun DB\n", - " context.log_artifact('Pandas Profiling Report',\n", - " body=profile.to_html(),\n", - " local_path='pandas_profiling_report.html')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"pandas_profiling_report\", kind=\"job\")\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"pandas_profiling_report\"\n", - "fn.spec.description = \"Create Pandas Profiling Report from Dataset\"\n", - "fn.metadata.categories = [\"analysis\"]\n", - "fn.metadata.labels = {\"author\": \"nicks\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
[ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name=\"pandas-profiling-report\", \n", - " handler=pandas_profiling_report, \n", - " inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run locally" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n", - "> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e -> http://mlrun-api:8080\n", - "> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "86c3397cc7384565815af90bc5a6d10b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7dece2ab7184c909611cf0aed3ef474", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7153ac93afcd4e77a4b5a312766af995", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default\n", - "> 2020-10-15 19:21:52,944 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "run = run_local(task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remotely" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Create MLRun image (only needs to be run once)\n", - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22 -> http://mlrun-api:8080\n", - "> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m\n", - "Summarize dataset: 100%|██████████| 19/19 [00:05<00:00, 3.78it/s, Completed] \n", - "Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.22s/it]\n", - "> 2020-10-15 19:23:33,779 [info] run executed, status=completed\n", - "Render HTML: 100%|██████████| 1/1 [00:00<00:00, 2.07it/s]\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default\n", - "> 2020-10-15 19:23:36,481 [info] run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(task, inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/pandas_profiling_report/0.9.0/src/pandas_profiling_report.py b/functions/development/pandas_profiling_report/0.9.0/src/pandas_profiling_report.py deleted file mode 100644 index c03077df..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/src/pandas_profiling_report.py +++ /dev/null @@ -1,27 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pandas_profiling - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem - - -def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. 
- :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - ) diff --git a/functions/development/pandas_profiling_report/0.9.0/static/documentation.html b/functions/development/pandas_profiling_report/0.9.0/static/documentation.html deleted file mode 100644 index 2c9807f4..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/static/documentation.html +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - -pandas_profiling_report package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

pandas_profiling_report package

-
-

Submodules

-
-
-

pandas_profiling_report.pandas_profiling_report module

-
-
-pandas_profiling_report.pandas_profiling_report.pandas_profiling_report(context: mlrun.execution.MLClientCtx, data: mlrun.datastore.DataItem)None[source]
-

Create a Pandas Profiling Report for a dataset. -:param context: the function context -:param data: Dataset to create report for

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.9.0/static/example.html b/functions/development/pandas_profiling_report/0.9.0/static/example.html deleted file mode 100644 index 47cdc635..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/static/example.html +++ /dev/null @@ -1,733 +0,0 @@ - - - - - - - -Pandas Profiling Report - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Pandas Profiling Report

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install pandas_profiling
-
-
-
-
-
-
-
import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
-
-
-
-
def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-    
-    # Load dataset
-    df = data.as_df()
-    
-    # Create Pandas Profiling Report
-    profile = df.profile_report(title='Pandas Profiling Report')
-    
-    # Save to MLRun DB
-    context.log_artifact('Pandas Profiling Report',
-                         body=profile.to_html(),
-                         local_path='pandas_profiling_report.html')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("pandas_profiling_report", kind="job")
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "pandas_profiling_report"
-fn.spec.description = "Create Pandas Profiling Report from Dataset"
-fn.metadata.categories = ["analysis"]
-fn.metadata.labels = {"author": "nicks"}
-fn.export("function.yaml")
-
-
-
-
-
> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'
-
-
-
-
-
-
-
task = NewTask(name="pandas-profiling-report", 
-               handler=pandas_profiling_report, 
-               inputs={"data": DATA_URL})
-
-
-
-
-
-

run locally

-
-
-
run = run_local(task)
-
-
-
-
-
> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e  -> http://mlrun-api:8080
-> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-
-
-

-
-
-

-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default
-> 2020-10-15 19:21:52,944 [info] run executed, status=completed
-
-
-
-
-
-
-

run remotely

-
-
-
# Create MLRun image (only needs to be run once)
-fn.deploy()
-
-
-
-
-
-
-
fn.run(task, inputs={"data": DATA_URL})
-
-
-
-
-
> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22  -> http://mlrun-api:8080
-> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m
-Summarize dataset: 100%|██████████| 19/19 [00:05<00:00,  3.78it/s, Completed]                         
-Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
-> 2020-10-15 19:23:33,779 [info] run executed, status=completed
-Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default
-> 2020-10-15 19:23:36,481 [info] run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7fe2297b51d0>
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.9.0/static/function.html b/functions/development/pandas_profiling_report/0.9.0/static/function.html deleted file mode 100644 index 2c7d0c27..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: pandas-profiling-report
-  tag: ''
-  hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b
-  project: ''
-  labels:
-    author: nicks
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: pandas_profiling_report
-  entry_points:
-    pandas_profiling_report:
-      name: pandas_profiling_report
-      doc: Create a Pandas Profiling Report for a dataset.
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: data
-        type: DataItem
-        doc: Dataset to create report for
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-  description: Create Pandas Profiling Report from Dataset
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg==
-    commands:
-    - python -m pip install pandas_profiling
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.9.0/static/item.html b/functions/development/pandas_profiling_report/0.9.0/static/item.html deleted file mode 100644 index ed4d0390..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: Create Pandas Profiling Report from Dataset
-doc: ''
-example: pandas_profiling_report.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: nicks
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: pandas-profiling-report
-platformVersion: 3.2.0
-spec:
-  filename: pandas_profiling_report.py
-  handler: pandas_profiling_report
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - pandas_profiling
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/0.9.0/static/source.html b/functions/development/pandas_profiling_report/0.9.0/static/source.html deleted file mode 100644 index 439b80b2..00000000 --- a/functions/development/pandas_profiling_report/0.9.0/static/source.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-
-    df = data.as_df()
-
-    profile = df.profile_report(title="Pandas Profiling Report")
-
-    context.log_artifact(
-        "Pandas Profiling Report",
-        body=profile.to_html(),
-        local_path="pandas_profiling_report.html",
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/1.1.0/src/README.md b/functions/development/pandas_profiling_report/1.1.0/src/README.md deleted file mode 100644 index 40e0c9b2..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/src/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## pandas_profiling_report - -Creates an html report with various graphs/statistics/correlations for a given dataset. See sample report [here](https://pandas-profiling.github.io/pandas-profiling/examples/master/titanic/titanic_report.html). Link to GitHub page [here](https://github.com/pandas-profiling/pandas-profiling). - - -Usage example: - -```python -import mlrun, os -mlrun.mlconf.dbpath = 'http://mlrun-api:8080' - -# Load pandas_profiling_report function from Github -func = mlrun.import_function("hub://pandas_profiling_report").apply(mlrun.mount_v3io()) - -# Build MLRun image (only needs to be run once) -func.deploy() - -# Create task -data = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv' - -task = NewTask(name="pandas-profiling-report", - inputs={"data": DATA_URL}) - -# Run task on cluster -run = func.run(task, artifact_path='/User/artifacts') -``` diff --git a/functions/development/pandas_profiling_report/1.1.0/src/function.yaml b/functions/development/pandas_profiling_report/1.1.0/src/function.yaml deleted file mode 100644 index ffdbbf83..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: job -metadata: - name: pandas-profiling-report - tag: '' - hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b - project: '' - labels: - author: nicks - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: pandas_profiling_report - entry_points: - pandas_profiling_report: - name: pandas_profiling_report - doc: Create a Pandas Profiling Report for a dataset. 
- parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: data - type: DataItem - doc: Dataset to create report for - default: '' - outputs: - - default: '' - lineno: 10 - description: Create Pandas Profiling Report from Dataset - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg== - commands: - - python -m pip install pandas_profiling - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py - affinity: null -verbose: false diff --git a/functions/development/pandas_profiling_report/1.1.0/src/item.yaml b/functions/development/pandas_profiling_report/1.1.0/src/item.yaml deleted file mode 100644 index 13d37436..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: Create Pandas Profiling Report from Dataset -doc: '' -example: pandas_profiling_report.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: nicks 
-maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: pandas-profiling-report -platformVersion: 3.5.0 -spec: - filename: pandas_profiling_report.py - handler: pandas_profiling_report - image: mlrun/mlrun - kind: job - requirements: - - pandas_profiling -url: '' -version: 1.1.0 diff --git a/functions/development/pandas_profiling_report/1.1.0/src/pandas_profiling_report.ipynb b/functions/development/pandas_profiling_report/1.1.0/src/pandas_profiling_report.ipynb deleted file mode 100644 index 61aeba26..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/src/pandas_profiling_report.ipynb +++ /dev/null @@ -1,794 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pandas Profiling Report" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install pandas_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pandas_profiling\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def pandas_profiling_report(\n", - " context: MLClientCtx,\n", - " data: DataItem,\n", - ") -> None:\n", - " \"\"\"Create a Pandas Profiling Report for a dataset.\n", - " :param context: 
the function context\n", - " :param data: Dataset to create report for\n", - " \"\"\"\n", - " \n", - " # Load dataset\n", - " df = data.as_df()\n", - " \n", - " # Create Pandas Profiling Report\n", - " profile = df.profile_report(title='Pandas Profiling Report')\n", - " \n", - " # Save to MLRun DB\n", - " context.log_artifact('Pandas Profiling Report',\n", - " body=profile.to_html(),\n", - " local_path='pandas_profiling_report.html')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"pandas_profiling_report\", kind=\"job\")\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"pandas_profiling_report\"\n", - "fn.spec.description = \"Create Pandas Profiling Report from Dataset\"\n", - "fn.metadata.categories = [\"analysis\"]\n", - "fn.metadata.labels = {\"author\": \"nicks\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, 
- "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name=\"pandas-profiling-report\", \n", - " handler=pandas_profiling_report, \n", - " inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run locally" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n", - "> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e -> http://mlrun-api:8080\n", - "> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "86c3397cc7384565815af90bc5a6d10b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7dece2ab7184c909611cf0aed3ef474", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7153ac93afcd4e77a4b5a312766af995", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default\n", - "> 2020-10-15 19:21:52,944 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "run = run_local(task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remotely" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Create MLRun image (only needs to be run once)\n", - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22 -> http://mlrun-api:8080\n", - "> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m\n", - "Summarize dataset: 100%|██████████| 19/19 [00:05<00:00, 3.78it/s, Completed] \n", - "Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.22s/it]\n", - "> 2020-10-15 19:23:33,779 [info] run executed, status=completed\n", - "Render HTML: 100%|██████████| 1/1 [00:00<00:00, 2.07it/s]\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default\n", - "> 2020-10-15 19:23:36,481 [info] run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(task, inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/pandas_profiling_report/1.1.0/src/pandas_profiling_report.py b/functions/development/pandas_profiling_report/1.1.0/src/pandas_profiling_report.py deleted file mode 100644 index c3d3d4d3..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/src/pandas_profiling_report.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pandas_profiling - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem - - -def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. - :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - ) diff --git a/functions/development/pandas_profiling_report/1.1.0/static/documentation.html b/functions/development/pandas_profiling_report/1.1.0/static/documentation.html deleted file mode 100644 index c0f26ed1..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/static/documentation.html +++ /dev/null @@ -1,229 +0,0 @@ - - - - - - - -pandas_profiling_report package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

pandas_profiling_report package

- -
- -
-
-
-
-
-

pandas_profiling_report package#

-
-

Submodules#

-
-
-

pandas_profiling_report.pandas_profiling_report module#

-
-
-pandas_profiling_report.pandas_profiling_report.pandas_profiling_report(context: mlrun.execution.MLClientCtx, data: mlrun.datastore.base.DataItem)None[source]#
-

Create a Pandas Profiling Report for a dataset. -:param context: the function context -:param data: Dataset to create report for

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/1.1.0/static/example.html b/functions/development/pandas_profiling_report/1.1.0/static/example.html deleted file mode 100644 index 9eff0e46..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/static/example.html +++ /dev/null @@ -1,851 +0,0 @@ - - - - - - - -Pandas Profiling Report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

Pandas Profiling Report

- -
- -
-
-
-
-
-

Pandas Profiling Report#

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install pandas_profiling
-
-
-
-
-
-
-
import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
-
-
-
-
def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-    
-    # Load dataset
-    df = data.as_df()
-    
-    # Create Pandas Profiling Report
-    profile = df.profile_report(title='Pandas Profiling Report')
-    
-    # Save to MLRun DB
-    context.log_artifact('Pandas Profiling Report',
-                         body=profile.to_html(),
-                         local_path='pandas_profiling_report.html')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

mlconfig#

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save#

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("pandas_profiling_report", kind="job")
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "pandas_profiling_report"
-fn.spec.description = "Create Pandas Profiling Report from Dataset"
-fn.metadata.categories = ["analysis"]
-fn.metadata.labels = {"author": "nicks"}
-fn.export("function.yaml")
-
-
-
-
-
> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun.platforms import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'
-
-
-
-
-
-
-
task = NewTask(name="pandas-profiling-report", 
-               handler=pandas_profiling_report, 
-               inputs={"data": DATA_URL})
-
-
-
-
-
-

run locally#

-
-
-
run = run_local(task)
-
-
-
-
-
> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e  -> http://mlrun-api:8080
-> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-
-
-

-
-
-

-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default
-> 2020-10-15 19:21:52,944 [info] run executed, status=completed
-
-
-
-
-
-
-

run remotely#

-
-
-
# Create MLRun image (only needs to be run once)
-fn.deploy()
-
-
-
-
-
-
-
fn.run(task, inputs={"data": DATA_URL})
-
-
-
-
-
> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22  -> http://mlrun-api:8080
-> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m
-Summarize dataset: 100%|██████████| 19/19 [00:05<00:00,  3.78it/s, Completed]                         
-Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
-> 2020-10-15 19:23:33,779 [info] run executed, status=completed
-Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default
-> 2020-10-15 19:23:36,481 [info] run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7fe2297b51d0>
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/1.1.0/static/function.html b/functions/development/pandas_profiling_report/1.1.0/static/function.html deleted file mode 100644 index 2c7d0c27..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: pandas-profiling-report
-  tag: ''
-  hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b
-  project: ''
-  labels:
-    author: nicks
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: pandas_profiling_report
-  entry_points:
-    pandas_profiling_report:
-      name: pandas_profiling_report
-      doc: Create a Pandas Profiling Report for a dataset.
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: data
-        type: DataItem
-        doc: Dataset to create report for
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-  description: Create Pandas Profiling Report from Dataset
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg==
-    commands:
-    - python -m pip install pandas_profiling
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/1.1.0/static/item.html b/functions/development/pandas_profiling_report/1.1.0/static/item.html deleted file mode 100644 index a7417fab..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: Create Pandas Profiling Report from Dataset
-doc: ''
-example: pandas_profiling_report.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: nicks
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: pandas-profiling-report
-platformVersion: 3.5.0
-spec:
-  filename: pandas_profiling_report.py
-  handler: pandas_profiling_report
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - pandas_profiling
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/1.1.0/static/pandas_profiling_report.html b/functions/development/pandas_profiling_report/1.1.0/static/pandas_profiling_report.html deleted file mode 100644 index af1fec42..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/static/pandas_profiling_report.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - -pandas_profiling_report.pandas_profiling_report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for pandas_profiling_report.pandas_profiling_report

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
[docs]def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. - :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - )
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/1.1.0/static/source.html b/functions/development/pandas_profiling_report/1.1.0/static/source.html deleted file mode 100644 index 8e0cfe44..00000000 --- a/functions/development/pandas_profiling_report/1.1.0/static/source.html +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-
-    df = data.as_df()
-
-    profile = df.profile_report(title="Pandas Profiling Report")
-
-    context.log_artifact(
-        "Pandas Profiling Report",
-        body=profile.to_html(),
-        local_path="pandas_profiling_report.html",
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/latest/src/README.md b/functions/development/pandas_profiling_report/latest/src/README.md deleted file mode 100644 index 40e0c9b2..00000000 --- a/functions/development/pandas_profiling_report/latest/src/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## pandas_profiling_report - -Creates an html report with various graphs/statistics/correlations for a given dataset. See sample report [here](https://pandas-profiling.github.io/pandas-profiling/examples/master/titanic/titanic_report.html). Link to GitHub page [here](https://github.com/pandas-profiling/pandas-profiling). - - -Usage example: - -```python -import mlrun, os -mlrun.mlconf.dbpath = 'http://mlrun-api:8080' - -# Load pandas_profiling_report function from Github -func = mlrun.import_function("hub://pandas_profiling_report").apply(mlrun.mount_v3io()) - -# Build MLRun image (only needs to be run once) -func.deploy() - -# Create task -data = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv' - -task = NewTask(name="pandas-profiling-report", - inputs={"data": DATA_URL}) - -# Run task on cluster -run = func.run(task, artifact_path='/User/artifacts') -``` diff --git a/functions/development/pandas_profiling_report/latest/src/function.yaml b/functions/development/pandas_profiling_report/latest/src/function.yaml deleted file mode 100644 index ffdbbf83..00000000 --- a/functions/development/pandas_profiling_report/latest/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: job -metadata: - name: pandas-profiling-report - tag: '' - hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b - project: '' - labels: - author: nicks - categories: - - data-analysis -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: pandas_profiling_report - entry_points: - pandas_profiling_report: - name: pandas_profiling_report - doc: Create a Pandas Profiling Report for a dataset. 
- parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: data - type: DataItem - doc: Dataset to create report for - default: '' - outputs: - - default: '' - lineno: 10 - description: Create Pandas Profiling Report from Dataset - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg== - commands: - - python -m pip install pandas_profiling - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py - affinity: null -verbose: false diff --git a/functions/development/pandas_profiling_report/latest/src/item.yaml b/functions/development/pandas_profiling_report/latest/src/item.yaml deleted file mode 100644 index 13d37436..00000000 --- a/functions/development/pandas_profiling_report/latest/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -description: Create Pandas Profiling Report from Dataset -doc: '' -example: pandas_profiling_report.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: nicks 
-maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: pandas-profiling-report -platformVersion: 3.5.0 -spec: - filename: pandas_profiling_report.py - handler: pandas_profiling_report - image: mlrun/mlrun - kind: job - requirements: - - pandas_profiling -url: '' -version: 1.1.0 diff --git a/functions/development/pandas_profiling_report/latest/src/pandas_profiling_report.ipynb b/functions/development/pandas_profiling_report/latest/src/pandas_profiling_report.ipynb deleted file mode 100644 index 61aeba26..00000000 --- a/functions/development/pandas_profiling_report/latest/src/pandas_profiling_report.ipynb +++ /dev/null @@ -1,794 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pandas Profiling Report" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install pandas_profiling" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pandas_profiling\n", - "\n", - "from mlrun.execution import MLClientCtx\n", - "from mlrun.datastore import DataItem" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def pandas_profiling_report(\n", - " context: MLClientCtx,\n", - " data: DataItem,\n", - ") -> None:\n", - " \"\"\"Create a Pandas Profiling Report for a dataset.\n", - " :param context: 
the function context\n", - " :param data: Dataset to create report for\n", - " \"\"\"\n", - " \n", - " # Load dataset\n", - " df = data.as_df()\n", - " \n", - " # Create Pandas Profiling Report\n", - " profile = df.profile_report(title='Pandas Profiling Report')\n", - " \n", - " # Save to MLRun DB\n", - " context.log_artifact('Pandas Profiling Report',\n", - " body=profile.to_html(),\n", - " local_path='pandas_profiling_report.html')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"pandas_profiling_report\", kind=\"job\")\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"pandas_profiling_report\"\n", - "fn.spec.description = \"Create Pandas Profiling Report from Dataset\"\n", - "fn.metadata.categories = [\"analysis\"]\n", - "fn.metadata.labels = {\"author\": \"nicks\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, 
- "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name=\"pandas-profiling-report\", \n", - " handler=pandas_profiling_report, \n", - " inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run locally" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n", - "> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e -> http://mlrun-api:8080\n", - "> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "86c3397cc7384565815af90bc5a6d10b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7dece2ab7184c909611cf0aed3ef474", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7153ac93afcd4e77a4b5a312766af995", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default\n", - "> 2020-10-15 19:21:52,944 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "run = run_local(task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remotely" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Create MLRun image (only needs to be run once)\n", - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22 -> http://mlrun-api:8080\n", - "> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m\n", - "Summarize dataset: 100%|██████████| 19/19 [00:05<00:00, 3.78it/s, Completed] \n", - "Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.22s/it]\n", - "> 2020-10-15 19:23:33,779 [info] run executed, status=completed\n", - "Render HTML: 100%|██████████| 1/1 [00:00<00:00, 2.07it/s]\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default\n", - "> 2020-10-15 19:23:36,481 [info] run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(task, inputs={\"data\": DATA_URL})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/pandas_profiling_report/latest/src/pandas_profiling_report.py b/functions/development/pandas_profiling_report/latest/src/pandas_profiling_report.py deleted file mode 100644 index c3d3d4d3..00000000 --- a/functions/development/pandas_profiling_report/latest/src/pandas_profiling_report.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pandas_profiling - -from mlrun.execution import MLClientCtx -from mlrun.datastore import DataItem - - -def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. - :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - ) diff --git a/functions/development/pandas_profiling_report/latest/static/documentation.html b/functions/development/pandas_profiling_report/latest/static/documentation.html deleted file mode 100644 index c0f26ed1..00000000 --- a/functions/development/pandas_profiling_report/latest/static/documentation.html +++ /dev/null @@ -1,229 +0,0 @@ - - - - - - - -pandas_profiling_report package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

pandas_profiling_report package

- -
- -
-
-
-
-
-

pandas_profiling_report package#

-
-

Submodules#

-
-
-

pandas_profiling_report.pandas_profiling_report module#

-
-
-pandas_profiling_report.pandas_profiling_report.pandas_profiling_report(context: mlrun.execution.MLClientCtx, data: mlrun.datastore.base.DataItem)None[source]#
-

Create a Pandas Profiling Report for a dataset. -:param context: the function context -:param data: Dataset to create report for

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/latest/static/example.html b/functions/development/pandas_profiling_report/latest/static/example.html deleted file mode 100644 index 9eff0e46..00000000 --- a/functions/development/pandas_profiling_report/latest/static/example.html +++ /dev/null @@ -1,851 +0,0 @@ - - - - - - - -Pandas Profiling Report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

Pandas Profiling Report

- -
- -
-
-
-
-
-

Pandas Profiling Report#

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install pandas_profiling
-
-
-
-
-
-
-
import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
-
-
-
-
def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-    
-    # Load dataset
-    df = data.as_df()
-    
-    # Create Pandas Profiling Report
-    profile = df.profile_report(title='Pandas Profiling Report')
-    
-    # Save to MLRun DB
-    context.log_artifact('Pandas Profiling Report',
-                         body=profile.to_html(),
-                         local_path='pandas_profiling_report.html')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-

mlconfig#

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save#

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("pandas_profiling_report", kind="job")
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "pandas_profiling_report"
-fn.spec.description = "Create Pandas Profiling Report from Dataset"
-fn.metadata.categories = ["analysis"]
-fn.metadata.labels = {"author": "nicks"}
-fn.export("function.yaml")
-
-
-
-
-
> 2020-10-15 19:21:40,986 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun.platforms import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fe232431610>
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'
-
-
-
-
-
-
-
task = NewTask(name="pandas-profiling-report", 
-               handler=pandas_profiling_report, 
-               inputs={"data": DATA_URL})
-
-
-
-
-
-

run locally#

-
-
-
run = run_local(task)
-
-
-
-
-
> 2020-10-15 19:21:41,030 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-> 2020-10-15 19:21:41,031 [info] starting run pandas-profiling-report uid=0894aed4f2854d96b776e25bdcaff80e  -> http://mlrun-api:8080
-> 2020-10-15 19:21:41,062 [warning] warning!, server (0.5.1) and client (0.5.2) ver dont match
-
-
-

-
-
-

-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:21:41completedpandas-profiling-report
v3io_user=nicks
kind=handler
owner=nicks
host=nicks-jupyter-76668bdd46-g9sxf
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0894aed4f2854d96b776e25bdcaff80e --project default , !mlrun logs 0894aed4f2854d96b776e25bdcaff80e --project default
-> 2020-10-15 19:21:52,944 [info] run executed, status=completed
-
-
-
-
-
-
-

run remotely#

-
-
-
# Create MLRun image (only needs to be run once)
-fn.deploy()
-
-
-
-
-
-
-
fn.run(task, inputs={"data": DATA_URL})
-
-
-
-
-
> 2020-10-15 19:23:17,199 [info] starting run pandas-profiling-report uid=0ab5c8dbff95471da6018c1a7afd3b22  -> http://mlrun-api:8080
-> 2020-10-15 19:23:17,303 [info] Job is running in the background, pod: pandas-profiling-report-xr48m
-Summarize dataset: 100%|██████████| 19/19 [00:05<00:00,  3.78it/s, Completed]                         
-Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
-> 2020-10-15 19:23:33,779 [info] run executed, status=completed
-Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 15 19:23:25completedpandas-profiling-report
v3io_user=nicks
kind=job
owner=nicks
host=pandas-profiling-report-xr48m
data
Pandas Profiling Report
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 0ab5c8dbff95471da6018c1a7afd3b22 --project default , !mlrun logs 0ab5c8dbff95471da6018c1a7afd3b22 --project default
-> 2020-10-15 19:23:36,481 [info] run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7fe2297b51d0>
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/latest/static/function.html b/functions/development/pandas_profiling_report/latest/static/function.html deleted file mode 100644 index 2c7d0c27..00000000 --- a/functions/development/pandas_profiling_report/latest/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: pandas-profiling-report
-  tag: ''
-  hash: 79fe77fb2920a8ffecfef2f614a0be494c2ea43b
-  project: ''
-  labels:
-    author: nicks
-  categories:
-  - data-analysis
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: pandas_profiling_report
-  entry_points:
-    pandas_profiling_report:
-      name: pandas_profiling_report
-      doc: Create a Pandas Profiling Report for a dataset.
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: data
-        type: DataItem
-        doc: Dataset to create report for
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-  description: Create Pandas Profiling Report from Dataset
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcGFuZGFzX3Byb2ZpbGluZwoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQoKCmRlZiBwYW5kYXNfcHJvZmlsaW5nX3JlcG9ydCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YTogRGF0YUl0ZW0sCikgLT4gTm9uZToKICAgICIiIkNyZWF0ZSBhIFBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IGZvciBhIGRhdGFzZXQuCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhdGE6ICAgICAgICAgICAgRGF0YXNldCB0byBjcmVhdGUgcmVwb3J0IGZvcgogICAgIiIiCgogICAgZGYgPSBkYXRhLmFzX2RmKCkKCiAgICBwcm9maWxlID0gZGYucHJvZmlsZV9yZXBvcnQodGl0bGU9IlBhbmRhcyBQcm9maWxpbmcgUmVwb3J0IikKCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgKICAgICAgICAiUGFuZGFzIFByb2ZpbGluZyBSZXBvcnQiLAogICAgICAgIGJvZHk9cHJvZmlsZS50b19odG1sKCksCiAgICAgICAgbG9jYWxfcGF0aD0icGFuZGFzX3Byb2ZpbGluZ19yZXBvcnQuaHRtbCIsCiAgICApCg==
-    commands:
-    - python -m pip install pandas_profiling
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/pandas_profiling_report/pandas_profiling_report.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/latest/static/item.html b/functions/development/pandas_profiling_report/latest/static/item.html deleted file mode 100644 index a7417fab..00000000 --- a/functions/development/pandas_profiling_report/latest/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-description: Create Pandas Profiling Report from Dataset
-doc: ''
-example: pandas_profiling_report.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: nicks
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: pandas-profiling-report
-platformVersion: 3.5.0
-spec:
-  filename: pandas_profiling_report.py
-  handler: pandas_profiling_report
-  image: mlrun/mlrun
-  kind: job
-  requirements:
-  - pandas_profiling
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/latest/static/pandas_profiling_report.html b/functions/development/pandas_profiling_report/latest/static/pandas_profiling_report.html deleted file mode 100644 index af1fec42..00000000 --- a/functions/development/pandas_profiling_report/latest/static/pandas_profiling_report.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - -pandas_profiling_report.pandas_profiling_report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for pandas_profiling_report.pandas_profiling_report

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-
[docs]def pandas_profiling_report( - context: MLClientCtx, - data: DataItem, -) -> None: - """Create a Pandas Profiling Report for a dataset. - :param context: the function context - :param data: Dataset to create report for - """ - - df = data.as_df() - - profile = df.profile_report(title="Pandas Profiling Report") - - context.log_artifact( - "Pandas Profiling Report", - body=profile.to_html(), - local_path="pandas_profiling_report.html", - )
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/pandas_profiling_report/latest/static/source.html b/functions/development/pandas_profiling_report/latest/static/source.html deleted file mode 100644 index 8e0cfe44..00000000 --- a/functions/development/pandas_profiling_report/latest/static/source.html +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pandas_profiling
-
-from mlrun.execution import MLClientCtx
-from mlrun.datastore import DataItem
-
-
-def pandas_profiling_report(
-    context: MLClientCtx,
-    data: DataItem,
-) -> None:
-    """Create a Pandas Profiling Report for a dataset.
-    :param context:         the function context
-    :param data:            Dataset to create report for
-    """
-
-    df = data.as_df()
-
-    profile = df.profile_report(title="Pandas Profiling Report")
-
-    context.log_artifact(
-        "Pandas Profiling Report",
-        body=profile.to_html(),
-        local_path="pandas_profiling_report.html",
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/src/function.yaml b/functions/development/rnn_serving/0.0.1/src/function.yaml deleted file mode 100644 index 827d5973..00000000 --- a/functions/development/rnn_serving/0.0.1/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: rnn-serving - tag: '' - hash: c21d8eee1a9d5eb69be14cc4d397f1227137fa93 - project: default - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - description: deploy an rnn based stock analysis model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: rnn-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/rnn_serving/rnn_serving.py - spec: - runtime: python:3.6 - handler: rnn_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQga2VyYXMKaW1wb3J0IGpzb24KCgpjbGFzcyBSTk5fTW9kZWxfU2VydmluZyhtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgIiIibG9hZCBhbmQgaW5pdGlhbGl6ZSB0aGUgbW9kZWwgYW5kL29yIG90aGVyIGVsZW1lbnRzIiIiCiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKHN1ZmZpeD0iLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0ga2VyYXMubW9kZWxzLmxvYWRfbW9kZWwobW9kZWxfZmlsZSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgICIiIkdlbmVyYXRlIG1vZGVsIHByZWRpY3Rpb25zIGZyb20gc2FtcGxlLiIiIgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsnaW5wdXRzJ10pCiAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cykKICAgICAgICAgICAgcmVzdWx0ID0ganNvbi5kdW1wcyhyZXN1bHQudG9saXN0KCkpCiAgICAgICAgICAgIHJldHVybiByZXN1bHQKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install keras - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/rnn_serving/rnn_serving.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/rnn_serving/0.0.1/src/item.yaml b/functions/development/rnn_serving/0.0.1/src/item.yaml deleted file mode 100644 index d3a273cd..00000000 --- a/functions/development/rnn_serving/0.0.1/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an rnn based stock analysis model server. 
-doc: '' -example: rnn_serving.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: rnn-serving -platformVersion: '' -spec: - filename: rnn_serving.py - handler: handler - image: mlrun/ml-models - kind: serving - requirements: [keras] -url: '' -version: 0.0.1 diff --git a/functions/development/rnn_serving/0.0.1/src/requirements.txt b/functions/development/rnn_serving/0.0.1/src/requirements.txt deleted file mode 100644 index 2764273d..00000000 --- a/functions/development/rnn_serving/0.0.1/src/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -mlrun -keras -tensorflow -wget \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/src/rnn_serving.ipynb b/functions/development/rnn_serving/0.0.1/src/rnn_serving.ipynb deleted file mode 100644 index 313e2b4f..00000000 --- a/functions/development/rnn_serving/0.0.1/src/rnn_serving.ipynb +++ /dev/null @@ -1,248 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio config \n", - "spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%nuclio config kind=\"serving\"\n", - "%nuclio cmd python -m pip install torch==1.6.0\n", - "%nuclio cmd python -m pip install tensorflow\n", - "%nuclio cmd python -m pip install keras" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from mlrun import MLClientCtx\n", - "import os\n", - "import mlrun\n", - "import numpy as np\n", - "import keras\n", - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": 
[ - "class RNN_Model_Serving(mlrun.serving.V2ModelServer):\n", - "\n", - "\n", - " def load(self):\n", - " \"\"\"load and initialize the model and/or other elements\"\"\"\n", - " model_file,extra_data = self.get_model(suffix = \".h5\")\n", - " self.model = keras.models.load_model(model_file)\n", - " \n", - " def predict(self, body):\n", - " try:\n", - " \"\"\"Generate model predictions from sample.\"\"\"\n", - " feats = np.asarray(body['inputs'])\n", - " result = self.model.predict(feats)\n", - " result = json.dumps(result.tolist())\n", - " return result\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "code", - "execution_count": 281, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# test localy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run this to download the pre-trained model to your `models` directory\n", - "\n", - "import os\n", - "model_location = 'https://igz-app-lab.s3.us-east-2.amazonaws.com/RNN.h5'\n", - "saved_models_directory = os.path.join(os.path.abspath('./'), 'models')\n", - "\n", - "# Create paths\n", - "os.makedirs(saved_models_directory, exist_ok=1)\n", - "model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location))\n", - "!wget -nc -P {saved_models_directory} {model_location}\n", - "\n", - "\n", - "\n", - "\n", - "model_dir = model_filepath\n", - "model_server = RNN_Model_Serving(model_dir = model_dir,name = \"RNN_serving\")\n", - "model_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# test remotely" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - 
"import requests\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 281, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-04-25 15:04:37,946 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 281, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(name = 'RNN-serving',\n", - " model_class='RNN_Model_Serving')\n", - "fn.spec.description = \"RNN based stock prediction\"\n", - "fn.metadata.categories = ['RNN', 'Stocks', 'Predictions']\n", - "fn.metadata.labels = {'author': 'Daniel', 'framework': \"tensorflow, keras\"}\n", - "fn.spec.max_replicas = 1\n", - "fn.export(\"function.yaml\")\n", - "fn.apply(mount_v3io())\n", - "fn.add_model('RNN_stocks', \"/Users/test/demos/stock-analysis/models/RNN.h5\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-04-25 15:04:37,956 [info] Starting remote function deploy\n", - "2021-04-25 15:04:38 (info) Deploying function\n", - "2021-04-25 15:04:38 (info) Building\n", - "2021-04-25 15:04:38 (info) Staging files and preparing base images\n", - "2021-04-25 15:04:38 (info) Building processor image\n" - ] - } - ], - "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 256, - "metadata": {}, - "outputs": [], - "source": [ - "# getting answer" - ] - }, - { - "cell_type": "code", - "execution_count": 257, - "metadata": {}, - "outputs": [], - "source": [ - "data = [[[0.0, 0.0, 0.0003997438873852843, 0.0, 0.0031504590166214763, 0.07695351128043912],[0.0, 0.0076737889289563155, 0.0011658857834522252, 0.0034156784725392075, 0.0014412414129996165, 0.04148475898082488],[0.0, 0.005938872792230376, 0.0, 0.004308552624495143, 0.0052619477491432365, 0.030420689911339623],[0.0, 
0.00974237019744173, 0.002864898969104024, 0.009126076646371328, 0.00754097274129828, 0.03420920993845116],[0.0, 0.012044527240189229, 0.006196233583000366, 0.01248470808067681, 0.009082756366275535, 0.03354111121910287],[0.0, 0.013579264661919943, 0.006129643489857672, 0.012247201899718174, 0.011462630413374697, 0.034077706918863734],[0.0, 0.01594821582320083, 0.009427632225050497, 0.01567375140495164, 0.014746872963329771, 0.041772700519581665],[0.00030003000301803695, 0.015347577857912631, 0.01465785196829128, 0.010008183299277662, 0.015383674399518488, 0.09121246281753899],[0.00030003000301803695, 0.019851802585909906, 0.01057694673341536, 0.01258648165342402, 0.015283029906551948, 0.04697070093018961],[0.00030003000301803695, 0.019718316169145544, 0.01062686388720635, 0.013909641632783631, 0.010724979922241862, 0.03015250378874856],[0.00030003000301803695, 0.015247488500415018, 0.007595438853113201, 0.011372653202875505, 0.008378858601442496, 0.024210897325231427],[0.00030003000301803695, 0.012878639159436767, 0.008361580749180142, 0.012077613789687636, 0.013104547126374655, 0.023022372892001874],[0.00030003000301803695, 0.017633036371026645, 0.010127184027974745, 0.01808246164906735, 0.013422334158536331, 0.030119960676463234],[0.00030003000301803695, 0.017983400032419605, 0.011592979391230274, 0.018353926865489978, 0.01575229508310738, 0.03006790591664354],[0.00040004000402404927, 0.026291122165628233, 0.018955200421947316, 0.01818423522181467, 0.015819289130762915, 0.07152689652172939],[0.00040004000402404927, 0.020318954134433742, 0.013358582670024766, 0.012077613789687636, 0.011462323570408328, 0.0456794077047931],[0.00040004000402404927, 0.01588142170466733, 0.007395465245156219, 0.007908106850034513, 0.005597122549398592, 0.031427718441502224],[0.00040004000402404927, 0.010042638269934567, 0.005296809836383476, 0.009621381602651136, 0.00738243720867815, 0.02567574365912337],[0.00040004000402404927, 0.011977834941958365, 0.006362759647989491, 
0.010788412845792017, 0.008764355648181232, 0.022205280753766737],[0.00040004000402404927, 0.013178907231929493, 0.004497220397348678, 0.010177771409308312, 0.0074069846459871025, 0.025356051256133154],[0.00040004000402404927, 0.011810951465927255, 0.0022319372593224696, 0.0040711499771812765, 0.0, 0.031626735214947514],[0.0005000500050300616, 0.025123446934923743, 0.028016434638316157, 0.023612400680174672, 0.032778090757603406, 0.10153228594145776],[0.0005000500050300616, 0.03713457711584622, 0.03174761480386212, 0.03477405231513664, 0.030096896917537364, 0.047626905771733244],[0.0005000500050300616, 0.03456554905987275, 0.025784497378993576, 0.03300994254296863, 0.02869022647874786, 0.028768812095043804],[0.0005000500050300616, 0.03329870085439446, 0.026250831359521554, 0.03389163506129611, 0.028856842209482148, 0.025428460696670638],[0.0005000500050300616, 0.033264387412404006, 0.02518488154791565, 0.031652823528145024, 0.02952708952900418, 0.028119392147072767],[0.0005000500050300616, 0.03399840997415393, 0.029015896021043597, 0.03396659342006425, 0.03401824774664752, 0.033913843613464494],[0.0005000500050300616, 0.038369148285386845, 0.030614871570584734, 0.0350454139979145, 0.031102319037314974, 0.033925229639953834],[0.0006000600060360739, 0.04257300312058865, 0.04390696581173115, 0.03887905779413703, 0.04573187570650594, 0.09803619685873038],[0.0006000600060360739, 0.05008001039381649, 0.04099204802457179, 0.045291103478150685, 0.041925488708792535, 0.036871285277117145],[0.0006000600060360739, 0.04614302657184077, 0.0375107586207738, 0.04030399134624385, 0.03666589342239146, 0.04111875053787802],[0.0006000600060360739, 0.04090477928209357, 0.03571180946327568, 0.039964711592538005, 0.040218521286923936, 0.027399177725746942],[0.0006000600060360739, 0.044508199792612446, 0.03767738635002715, 0.04535891801543401, 0.042307610482901215, 0.030618589411891214],[0.0006000600060360739, 0.04657678106109786, 0.04117524702891262, 0.04627560490567262, 
0.044793447634383865, 0.05405884268018613],[0.0006000600060360739, 0.04910742286297509, 0.047837916256705526, 0.04749626657677186, 0.05335661657762081, 0.06314168219836809],[0.007700770077008201, 0.030194810748639833, 0.06282892201923929, 0.03002444636418833, 0.05895373912701951, 0.2321397402054094],[0.007700770077008201, 0.06292535431428914, 0.05726493849616998, 0.060116707265659874, 0.05526702088618707, 0.078071078727893],[0.007700770077008201, 0.05958890663729899, 0.05200208452407595, 0.04851400230424485, 0.04286626924365666, 0.07557726468701688],[0.007700770077008201, 0.04721071426535017, 0.03784391241501628, 0.03962543183883216, 0.039346780419491134, 0.0617907570715274],[0.007700770077008201, 0.043530928527849455, 0.03890996389088652, 0.04125391253643351, 0.044340956539989596, 0.03782111959214908],[0.007700770077008201, 0.0485636006265614, 0.04137511897260526, 0.045935703950098494, 0.04155011747994375, 0.0446209643074681],[0.007700770077008201, 0.04580936144008119, 0.036544643116749764, 0.03738630970474699, 0.03525830245470285, 0.05051381704471508],[0.008000800080026238, 0.03993728276638431, 0.037731268410127417, 0.03160944293101264, 0.04028224234293831, 0.09797242089055323],[0.008000800080026238, 0.04313250568360927, 0.03691114278969587, 0.041932368510200546, 0.040620792415823725, 0.03983262723887141],[0.008000800080026238, 0.04514213299686476, 0.03984273351620693, 0.04647853084929898, 0.0439723358564007, 0.03439152856064892],[0.008000800080026238, 0.048335115867431555, 0.03857681009664404, 0.03775954849391683, 0.0353588446666806, 0.04126804866755406],[0.008000800080026238, 0.0397703992903532, 0.03627818107991432, 0.0411181799282222, 0.04165976269992355, 0.029229849676112238],[0.008000800080026238, 0.04600954015507641, 0.04194149058924401, 0.04400190253425518, 0.04668707786078907, 0.04065361967520816],[0.008000800080026238, 0.05101421167056164, 0.04513954335259052, 0.051906592774014215, 0.04899965101726622, 0.03669087617586497],[0.00810081008103225, 
0.05958890663729899, 0.060596883095652365, 0.05923460061275354, 0.06327715652305188, 0.0879437184546251],[0.00810081008103225, 0.06756306545905066, 0.062162614430754326, 0.06873391604829482, 0.06699742292820654, 0.04326880002341804],[0.00810081008103225, 0.0711663841492669, 0.06549394904465067, 0.0727161306246723, 0.06872341461398956, 0.04025076154082211],[0.00810081008103225, 0.07293469734525959, 0.06572716686704694, 0.07304246867278985, 0.07101921368830555, 0.03401019316500588],[0.00810081008103225, 0.07523685438800698, 0.06839229555672255, 0.07562087056058087, 0.07413622682159382, 0.042899714001500375],[0.00810081008103225, 0.07830643105177104, 0.06949159124703208, 0.07365311010927356, 0.06820259981908527, 0.0531809708965342],[0.00810081008103225, 0.07233416120027403, 0.06372834576585618, 0.06798754200359969, 0.06445021718345156, 0.05271274214084092],[0.008200820082038263, 0.07760570372898534, 0.07595438853113234, 0.07419593700847404, 0.0738178261034993, 0.1051673434581077],[0.008200820082038263, 0.07803945821824254, 0.07592104265242872, 0.07908127556763367, 0.0799008856306257, 0.06479979642881596],[0.008200820082038263, 0.0840451251290063, 0.07642072251166032, 0.08345774626305669, 0.07933107824209251, 0.038738024827896046],[0.008200820082038263, 0.08354457652121561, 0.07695374824959555, 0.08169353295724402, 0.07758759650722691, 0.037608309576979866]]]" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "event_data = {'inputs':X_test[:5].tolist()}\n", - "\n", - "resp = requests.put(addr+\"/RNN_stocks/predict\", json = event_data)\n", - "s = json.loads(json.loads(resp.text))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/src/rnn_serving.py b/functions/development/rnn_serving/0.0.1/src/rnn_serving.py deleted file mode 100644 index 89cefd19..00000000 --- a/functions/development/rnn_serving/0.0.1/src/rnn_serving.py +++ /dev/null @@ -1,21 +0,0 @@ -import mlrun -import numpy as np -import keras -import json - - -class RNN_Model_Serving(mlrun.serving.V2ModelServer): - def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file) - - def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/src/test_rnn_serving.py b/functions/development/rnn_serving/0.0.1/src/test_rnn_serving.py deleted file mode 100644 index 255ac5e7..00000000 --- a/functions/development/rnn_serving/0.0.1/src/test_rnn_serving.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import wget -from mlrun import import_function -from os import path -from rnn_serving import * - -DATASET = np.array([[6.9955170e-01, 6.9952875e-01, 2.7922913e-02, 2.7853036e-02, - 6.9955170e-01, 7.0086759e-01, 7.0118028e-01, 7.0142627e-01, - 2.7922913e-02, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 6.9998503e-01, 1.6527303e-03, 2.7853036e-02, - 7.0000792e-01, 7.0085293e-01, 7.0118028e-01, 7.0203447e-01, - 1.6527303e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0025057e-01, 1.6904050e-04, 2.7853036e-02, - 7.0027345e-01, 7.0014298e-01, 7.0190376e-01, 7.0128226e-01, - 1.6904050e-04, 0.0000000e+00, 
0.0000000e+00], - [6.9955170e-01, 7.0144778e-01, 1.6904050e-04, 2.7853036e-02, - 7.0147055e-01, 7.0178574e-01, 7.0236105e-01, 7.0295709e-01, - 7.3906886e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0326620e-01, 7.0308524e-01, 7.0490342e-01, 7.0427048e-01, - 2.4815742e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0191067e-01, 7.0173001e-01, 7.0354480e-01, 7.0291305e-01, - 2.9976186e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0166123e-01, 7.0148063e-01, 7.0284635e-01, 7.0249581e-01, - 2.7904075e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0133996e-01, 7.0143080e-01, 7.0297277e-01, 7.0250750e-01, - 4.1491759e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0150572e-01, 7.0251614e-01, 7.0281982e-01, 7.0370042e-01, - 2.1256472e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0272487e-01, 7.0258951e-01, 7.0429617e-01, 7.0376801e-01, - 1.4207334e-03, 0.0000000e+00, 0.0000000e+00]]).reshape(1, 10, 11).tolist() - - -def download_pretrained_model(model_path): - # Run this to download the pre-trained model to your `models` directory - model_location = 'https://s3.wasabisys.com/iguazio/models/rnn/rnn_model.h5' - saved_models_directory = model_path - # Create paths - os.makedirs(saved_models_directory, exist_ok=1) - model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location)) - wget.download(model_location, model_filepath) - - -def test_rnn_serving(): - model_path = os.path.join(os.path.abspath('./'), 'models') - model = model_path + '/rnn_model.h5' - if not path.exists(model): - download_pretrained_model(model_path) - - fn = import_function('function.yaml') - fn.add_model('rnn_model', model_path=model, 
class_name='RNN_Model_Serving') - # create an emulator (mock server) from the function configuration) - server = fn.to_mock_server() - resp = server.test("/v2/models/rnn_model/infer", {"inputs": DATASET}) - assert (resp['outputs'] == '[[0.453309565782547]]') diff --git a/functions/development/rnn_serving/0.0.1/static/documentation.html b/functions/development/rnn_serving/0.0.1/static/documentation.html deleted file mode 100644 index 039c2600..00000000 --- a/functions/development/rnn_serving/0.0.1/static/documentation.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - - -rnn_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

rnn_serving package

-
-

Submodules

-
-
-

rnn_serving.rnn_serving module

-
-
-class rnn_serving.rnn_serving.RNN_Model_Serving(context, name: str, model_path: Optional[str] = None, model=None, protocol=None, **class_args)[source]
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]
-

load and initialize the model and/or other elements

-
-
-
-predict(body)[source]
-

model prediction operation

-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/static/example.html b/functions/development/rnn_serving/0.0.1/static/example.html deleted file mode 100644 index 20e32e74..00000000 --- a/functions/development/rnn_serving/0.0.1/static/example.html +++ /dev/null @@ -1,278 +0,0 @@ - - - - - - - -<no title> - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%%nuclio config 
-spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
-
-
%nuclio config kind="serving"
-%nuclio cmd python -m pip install torch==1.6.0
-%nuclio cmd python -m pip install tensorflow
-%nuclio cmd python -m pip install keras
-
-
-
-
-
-
-
import pandas as pd
-from mlrun import MLClientCtx
-import os
-import mlrun
-import numpy as np
-import keras
-import json
-
-
-
-
-
-
-
class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-
-
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file,extra_data = self.get_model(suffix = ".h5")
-        self.model = keras.models.load_model(model_file)
-        
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
# test localy
-
-
-
-
-
-
-
# Run this to download the pre-trained model to your `models` directory
-
-import os
-model_location = 'https://igz-app-lab.s3.us-east-2.amazonaws.com/RNN.h5'
-saved_models_directory = os.path.join(os.path.abspath('./'), 'models')
-
-# Create paths
-os.makedirs(saved_models_directory, exist_ok=1)
-model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location))
-!wget -nc -P {saved_models_directory} {model_location}
-
-
-
-
-model_dir = model_filepath
-model_server = RNN_Model_Serving(model_dir = model_dir,name = "RNN_serving")
-model_server.load()
-
-
-
-
-
-
-
# test remotely
-
-
-
-
-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-import os
-
-
-
-
-
-
-
fn = new_model_server(name = 'RNN-serving',
-                      model_class='RNN_Model_Serving')
-fn.spec.description = "RNN based stock prediction"
-fn.metadata.categories = ['RNN', 'Stocks', 'Predictions']
-fn.metadata.labels = {'author': 'Daniel', 'framework': "tensorflow, keras"}
-fn.spec.max_replicas = 1
-fn.export("function.yaml")
-fn.apply(mount_v3io())
-fn.add_model('RNN_stocks', "/Users/test/demos/stock-analysis/models/RNN.h5")
-
-
-
-
-
> 2021-04-25 15:04:37,946 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f6a64571250>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
> 2021-04-25 15:04:37,956 [info] Starting remote function deploy
-2021-04-25 15:04:38  (info) Deploying function
-2021-04-25 15:04:38  (info) Building
-2021-04-25 15:04:38  (info) Staging files and preparing base images
-2021-04-25 15:04:38  (info) Building processor image
-
-
-
-
-
-
-
# getting answer
-
-
-
-
-
-
-
data = [[[0.0, 0.0, 0.0003997438873852843, 0.0, 0.0031504590166214763, 0.07695351128043912],[0.0, 0.0076737889289563155, 0.0011658857834522252, 0.0034156784725392075, 0.0014412414129996165, 0.04148475898082488],[0.0, 0.005938872792230376, 0.0, 0.004308552624495143, 0.0052619477491432365, 0.030420689911339623],[0.0, 0.00974237019744173, 0.002864898969104024, 0.009126076646371328, 0.00754097274129828, 0.03420920993845116],[0.0, 0.012044527240189229, 0.006196233583000366, 0.01248470808067681, 0.009082756366275535, 0.03354111121910287],[0.0, 0.013579264661919943, 0.006129643489857672, 0.012247201899718174, 0.011462630413374697, 0.034077706918863734],[0.0, 0.01594821582320083, 0.009427632225050497, 0.01567375140495164, 0.014746872963329771, 0.041772700519581665],[0.00030003000301803695, 0.015347577857912631, 0.01465785196829128, 0.010008183299277662, 0.015383674399518488, 0.09121246281753899],[0.00030003000301803695, 0.019851802585909906, 0.01057694673341536, 0.01258648165342402, 0.015283029906551948, 0.04697070093018961],[0.00030003000301803695, 0.019718316169145544, 0.01062686388720635, 0.013909641632783631, 0.010724979922241862, 0.03015250378874856],[0.00030003000301803695, 0.015247488500415018, 0.007595438853113201, 0.011372653202875505, 0.008378858601442496, 0.024210897325231427],[0.00030003000301803695, 0.012878639159436767, 0.008361580749180142, 0.012077613789687636, 0.013104547126374655, 0.023022372892001874],[0.00030003000301803695, 0.017633036371026645, 0.010127184027974745, 0.01808246164906735, 0.013422334158536331, 0.030119960676463234],[0.00030003000301803695, 0.017983400032419605, 0.011592979391230274, 0.018353926865489978, 0.01575229508310738, 0.03006790591664354],[0.00040004000402404927, 0.026291122165628233, 0.018955200421947316, 0.01818423522181467, 0.015819289130762915, 0.07152689652172939],[0.00040004000402404927, 0.020318954134433742, 0.013358582670024766, 0.012077613789687636, 0.011462323570408328, 0.0456794077047931],[0.00040004000402404927, 
0.01588142170466733, 0.007395465245156219, 0.007908106850034513, 0.005597122549398592, 0.031427718441502224],[0.00040004000402404927, 0.010042638269934567, 0.005296809836383476, 0.009621381602651136, 0.00738243720867815, 0.02567574365912337],[0.00040004000402404927, 0.011977834941958365, 0.006362759647989491, 0.010788412845792017, 0.008764355648181232, 0.022205280753766737],[0.00040004000402404927, 0.013178907231929493, 0.004497220397348678, 0.010177771409308312, 0.0074069846459871025, 0.025356051256133154],[0.00040004000402404927, 0.011810951465927255, 0.0022319372593224696, 0.0040711499771812765, 0.0, 0.031626735214947514],[0.0005000500050300616, 0.025123446934923743, 0.028016434638316157, 0.023612400680174672, 0.032778090757603406, 0.10153228594145776],[0.0005000500050300616, 0.03713457711584622, 0.03174761480386212, 0.03477405231513664, 0.030096896917537364, 0.047626905771733244],[0.0005000500050300616, 0.03456554905987275, 0.025784497378993576, 0.03300994254296863, 0.02869022647874786, 0.028768812095043804],[0.0005000500050300616, 0.03329870085439446, 0.026250831359521554, 0.03389163506129611, 0.028856842209482148, 0.025428460696670638],[0.0005000500050300616, 0.033264387412404006, 0.02518488154791565, 0.031652823528145024, 0.02952708952900418, 0.028119392147072767],[0.0005000500050300616, 0.03399840997415393, 0.029015896021043597, 0.03396659342006425, 0.03401824774664752, 0.033913843613464494],[0.0005000500050300616, 0.038369148285386845, 0.030614871570584734, 0.0350454139979145, 0.031102319037314974, 0.033925229639953834],[0.0006000600060360739, 0.04257300312058865, 0.04390696581173115, 0.03887905779413703, 0.04573187570650594, 0.09803619685873038],[0.0006000600060360739, 0.05008001039381649, 0.04099204802457179, 0.045291103478150685, 0.041925488708792535, 0.036871285277117145],[0.0006000600060360739, 0.04614302657184077, 0.0375107586207738, 0.04030399134624385, 0.03666589342239146, 0.04111875053787802],[0.0006000600060360739, 0.04090477928209357, 
0.03571180946327568, 0.039964711592538005, 0.040218521286923936, 0.027399177725746942],[0.0006000600060360739, 0.044508199792612446, 0.03767738635002715, 0.04535891801543401, 0.042307610482901215, 0.030618589411891214],[0.0006000600060360739, 0.04657678106109786, 0.04117524702891262, 0.04627560490567262, 0.044793447634383865, 0.05405884268018613],[0.0006000600060360739, 0.04910742286297509, 0.047837916256705526, 0.04749626657677186, 0.05335661657762081, 0.06314168219836809],[0.007700770077008201, 0.030194810748639833, 0.06282892201923929, 0.03002444636418833, 0.05895373912701951, 0.2321397402054094],[0.007700770077008201, 0.06292535431428914, 0.05726493849616998, 0.060116707265659874, 0.05526702088618707, 0.078071078727893],[0.007700770077008201, 0.05958890663729899, 0.05200208452407595, 0.04851400230424485, 0.04286626924365666, 0.07557726468701688],[0.007700770077008201, 0.04721071426535017, 0.03784391241501628, 0.03962543183883216, 0.039346780419491134, 0.0617907570715274],[0.007700770077008201, 0.043530928527849455, 0.03890996389088652, 0.04125391253643351, 0.044340956539989596, 0.03782111959214908],[0.007700770077008201, 0.0485636006265614, 0.04137511897260526, 0.045935703950098494, 0.04155011747994375, 0.0446209643074681],[0.007700770077008201, 0.04580936144008119, 0.036544643116749764, 0.03738630970474699, 0.03525830245470285, 0.05051381704471508],[0.008000800080026238, 0.03993728276638431, 0.037731268410127417, 0.03160944293101264, 0.04028224234293831, 0.09797242089055323],[0.008000800080026238, 0.04313250568360927, 0.03691114278969587, 0.041932368510200546, 0.040620792415823725, 0.03983262723887141],[0.008000800080026238, 0.04514213299686476, 0.03984273351620693, 0.04647853084929898, 0.0439723358564007, 0.03439152856064892],[0.008000800080026238, 0.048335115867431555, 0.03857681009664404, 0.03775954849391683, 0.0353588446666806, 0.04126804866755406],[0.008000800080026238, 0.0397703992903532, 0.03627818107991432, 0.0411181799282222, 0.04165976269992355, 
0.029229849676112238],[0.008000800080026238, 0.04600954015507641, 0.04194149058924401, 0.04400190253425518, 0.04668707786078907, 0.04065361967520816],[0.008000800080026238, 0.05101421167056164, 0.04513954335259052, 0.051906592774014215, 0.04899965101726622, 0.03669087617586497],[0.00810081008103225, 0.05958890663729899, 0.060596883095652365, 0.05923460061275354, 0.06327715652305188, 0.0879437184546251],[0.00810081008103225, 0.06756306545905066, 0.062162614430754326, 0.06873391604829482, 0.06699742292820654, 0.04326880002341804],[0.00810081008103225, 0.0711663841492669, 0.06549394904465067, 0.0727161306246723, 0.06872341461398956, 0.04025076154082211],[0.00810081008103225, 0.07293469734525959, 0.06572716686704694, 0.07304246867278985, 0.07101921368830555, 0.03401019316500588],[0.00810081008103225, 0.07523685438800698, 0.06839229555672255, 0.07562087056058087, 0.07413622682159382, 0.042899714001500375],[0.00810081008103225, 0.07830643105177104, 0.06949159124703208, 0.07365311010927356, 0.06820259981908527, 0.0531809708965342],[0.00810081008103225, 0.07233416120027403, 0.06372834576585618, 0.06798754200359969, 0.06445021718345156, 0.05271274214084092],[0.008200820082038263, 0.07760570372898534, 0.07595438853113234, 0.07419593700847404, 0.0738178261034993, 0.1051673434581077],[0.008200820082038263, 0.07803945821824254, 0.07592104265242872, 0.07908127556763367, 0.0799008856306257, 0.06479979642881596],[0.008200820082038263, 0.0840451251290063, 0.07642072251166032, 0.08345774626305669, 0.07933107824209251, 0.038738024827896046],[0.008200820082038263, 0.08354457652121561, 0.07695374824959555, 0.08169353295724402, 0.07758759650722691, 0.037608309576979866]]]
-
-
-
-
-
-
-
import json
-
-event_data = {'inputs':X_test[:5].tolist()}
-
-resp = requests.put(addr+"/RNN_stocks/predict", json = event_data)
-s = json.loads(json.loads(resp.text))
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/static/function.html b/functions/development/rnn_serving/0.0.1/static/function.html deleted file mode 100644 index b5a7d1d4..00000000 --- a/functions/development/rnn_serving/0.0.1/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: rnn-serving
-  tag: ''
-  hash: c21d8eee1a9d5eb69be14cc4d397f1227137fa93
-  project: default
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: deploy an rnn based stock analysis model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: rnn-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/rnn_serving/rnn_serving.py
-    spec:
-      runtime: python:3.6
-      handler: rnn_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQga2VyYXMKaW1wb3J0IGpzb24KCgpjbGFzcyBSTk5fTW9kZWxfU2VydmluZyhtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgIiIibG9hZCBhbmQgaW5pdGlhbGl6ZSB0aGUgbW9kZWwgYW5kL29yIG90aGVyIGVsZW1lbnRzIiIiCiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKHN1ZmZpeD0iLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0ga2VyYXMubW9kZWxzLmxvYWRfbW9kZWwobW9kZWxfZmlsZSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgICIiIkdlbmVyYXRlIG1vZGVsIHByZWRpY3Rpb25zIGZyb20gc2FtcGxlLiIiIgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsnaW5wdXRzJ10pCiAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cykKICAgICAgICAgICAgcmVzdWx0ID0ganNvbi5kdW1wcyhyZXN1bHQudG9saXN0KCkpCiAgICAgICAgICAgIHJldHVybiByZXN1bHQKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install keras
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/rnn_serving/rnn_serving.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/static/item.html b/functions/development/rnn_serving/0.0.1/static/item.html deleted file mode 100644 index 053b258f..00000000 --- a/functions/development/rnn_serving/0.0.1/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an rnn based stock analysis model server.
-doc: ''
-example: rnn_serving.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: rnn-serving
-platformVersion: ''
-spec:
-  filename: rnn_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: serving
-  requirements: [keras]
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.0.1/static/source.html b/functions/development/rnn_serving/0.0.1/static/source.html deleted file mode 100644 index 9981d925..00000000 --- a/functions/development/rnn_serving/0.0.1/static/source.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import mlrun
-import numpy as np
-import keras
-import json
-
-
-class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file, extra_data = self.get_model(suffix=".h5")
-        self.model = keras.models.load_model(model_file)
-
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/src/function.yaml b/functions/development/rnn_serving/0.8.0/src/function.yaml deleted file mode 100644 index 445dec4d..00000000 --- a/functions/development/rnn_serving/0.8.0/src/function.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: serving -metadata: - name: rnn-serving - tag: '' - hash: 548cd27edfdc49aed0b069d94bd049435d484722 - project: default - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - description: deploy an rnn based stock analysis model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: rnn-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py - spec: - runtime: python:3.6 - handler: rnn_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving_v2 - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py - origin_filename: /User/test/functions/rnn_serving/rnn_serving.py - secret_sources: [] - mount_applied: false - affinity: null -verbose: false diff --git a/functions/development/rnn_serving/0.8.0/src/item.yaml b/functions/development/rnn_serving/0.8.0/src/item.yaml deleted file mode 100644 index 7630baf1..00000000 --- a/functions/development/rnn_serving/0.8.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an rnn based stock analysis model server. 
-doc: '' -example: rnn_serving.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: rnn-serving -platformVersion: 3.2.0 -spec: - filename: rnn_serving.py - handler: handler - image: mlrun/ml-models - kind: serving - requirements: null -url: '' -version: 0.8.0 diff --git a/functions/development/rnn_serving/0.8.0/src/requirements.txt b/functions/development/rnn_serving/0.8.0/src/requirements.txt deleted file mode 100644 index 2764273d..00000000 --- a/functions/development/rnn_serving/0.8.0/src/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -mlrun -keras -tensorflow -wget \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/src/rnn_serving.ipynb b/functions/development/rnn_serving/0.8.0/src/rnn_serving.ipynb deleted file mode 100644 index dbdf3b87..00000000 --- a/functions/development/rnn_serving/0.8.0/src/rnn_serving.ipynb +++ /dev/null @@ -1,285 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **RNN Serving**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following section we create a new model serving function which wraps our class , and specify model and other resources.
\n", - "Deploying the serving function will provide us an http endpoint that can handle requests in real time.
\n", - "This function is part of the [stock-analysis demo](https://github.com/mlrun/demos/tree/master/stock-analysis).
\n", - "To see how the model is trained or how the data-set is generated, check out code folder in the demo repository." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Setup function parameters](#Setup-function-parameters)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Testing the function locally](#Testing-the-function-locally)\n", - "4. [Testing the function remotely](#Testing-the-function-remotely)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Following packages are required, make sure to install\n", - "# !pip install pip install torch==1.6.0\n", - "# !pip install tensorflow\n", - "# !pip install keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Setup function parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Setting up models path\n", - "rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - 
"mlrun.set_environment(project='function-marketplace')\n", - "\n", - "# Importing the function from the hub\n", - "fn = mlrun.import_function(\"hub://rnn_serving\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "# Manually specifying needed packages \n", - "fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']\n", - "\n", - "# Adding the model \n", - "fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded\n", - "> 2021-10-17 10:43:54,257 [info] Initializing endpoint records\n", - "> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']\n" - ] - } - ], - "source": [ - "# When mocking, class has to be present\n", - "from rnn_serving import *\n", - "\n", - "# Mocking function\n", - "server = fn.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting the data\n", - "import cloudpickle as cp\n", - "from urllib.request import urlopen\n", - "\n", - "rnn_data = cp.load(urlopen(data_path))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "model used in this example take inputs with the shape `(None, None, 11)`.
\n", - "whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
\n", - "and the last dimenstion is the number of features the dataset have.
\n", - "our testing dataset has `(1,10,11)` means one instance to predict, with sequence length of 10, each step has 11 features." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import requests\n", - "\n", - "# KFServing protocol event\n", - "event_data = {\"inputs\": rnn_data}\n", - "\n", - "response = server.test(path='/v2/models/rnn_model/predict',body=event_data)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:57,192 [info] Starting remote function deploy\n", - "2021-10-17 10:43:57 (info) Deploying function\n", - "2021-10-17 10:43:57 (info) Building\n", - "2021-10-17 10:43:57 (info) Staging files and preparing base images\n", - "2021-10-17 10:43:57 (info) Building processor image\n", - "2021-10-17 10:43:58 (info) Build complete\n", - "2021-10-17 10:44:10 (info) Function deploy complete\n", - "> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}\n" - ] - } - ], - "source": [ - "address = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 10, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import requests\n", - "\n", - "# using requests to predict\n", - "response = requests.put(address+\"/v2/models/rnn_model/predict\", json = json.dumps(event_data))\n", - "json.loads(response.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#RNN-Serving)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/rnn_serving/0.8.0/src/rnn_serving.py b/functions/development/rnn_serving/0.8.0/src/rnn_serving.py deleted file mode 100644 index aa66ee9f..00000000 --- a/functions/development/rnn_serving/0.8.0/src/rnn_serving.py +++ /dev/null @@ -1,21 +0,0 @@ -import mlrun -import numpy as np -from tensorflow import keras -import json - - -class RNN_Model_Serving(mlrun.serving.V2ModelServer): - def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file) - - def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/src/test_rnn_serving.py b/functions/development/rnn_serving/0.8.0/src/test_rnn_serving.py deleted file mode 100644 index 255ac5e7..00000000 --- 
a/functions/development/rnn_serving/0.8.0/src/test_rnn_serving.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import wget -from mlrun import import_function -from os import path -from rnn_serving import * - -DATASET = np.array([[6.9955170e-01, 6.9952875e-01, 2.7922913e-02, 2.7853036e-02, - 6.9955170e-01, 7.0086759e-01, 7.0118028e-01, 7.0142627e-01, - 2.7922913e-02, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 6.9998503e-01, 1.6527303e-03, 2.7853036e-02, - 7.0000792e-01, 7.0085293e-01, 7.0118028e-01, 7.0203447e-01, - 1.6527303e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0025057e-01, 1.6904050e-04, 2.7853036e-02, - 7.0027345e-01, 7.0014298e-01, 7.0190376e-01, 7.0128226e-01, - 1.6904050e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0144778e-01, 1.6904050e-04, 2.7853036e-02, - 7.0147055e-01, 7.0178574e-01, 7.0236105e-01, 7.0295709e-01, - 7.3906886e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0326620e-01, 7.0308524e-01, 7.0490342e-01, 7.0427048e-01, - 2.4815742e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0191067e-01, 7.0173001e-01, 7.0354480e-01, 7.0291305e-01, - 2.9976186e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0166123e-01, 7.0148063e-01, 7.0284635e-01, 7.0249581e-01, - 2.7904075e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0133996e-01, 7.0143080e-01, 7.0297277e-01, 7.0250750e-01, - 4.1491759e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0150572e-01, 7.0251614e-01, 7.0281982e-01, 7.0370042e-01, - 2.1256472e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0272487e-01, 7.0258951e-01, 7.0429617e-01, 7.0376801e-01, - 1.4207334e-03, 0.0000000e+00, 0.0000000e+00]]).reshape(1, 10, 11).tolist() - - -def 
download_pretrained_model(model_path): - # Run this to download the pre-trained model to your `models` directory - model_location = 'https://s3.wasabisys.com/iguazio/models/rnn/rnn_model.h5' - saved_models_directory = model_path - # Create paths - os.makedirs(saved_models_directory, exist_ok=1) - model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location)) - wget.download(model_location, model_filepath) - - -def test_rnn_serving(): - model_path = os.path.join(os.path.abspath('./'), 'models') - model = model_path + '/rnn_model.h5' - if not path.exists(model): - download_pretrained_model(model_path) - - fn = import_function('function.yaml') - fn.add_model('rnn_model', model_path=model, class_name='RNN_Model_Serving') - # create an emulator (mock server) from the function configuration) - server = fn.to_mock_server() - resp = server.test("/v2/models/rnn_model/infer", {"inputs": DATASET}) - assert (resp['outputs'] == '[[0.453309565782547]]') diff --git a/functions/development/rnn_serving/0.8.0/static/documentation.html b/functions/development/rnn_serving/0.8.0/static/documentation.html deleted file mode 100644 index eb2a1914..00000000 --- a/functions/development/rnn_serving/0.8.0/static/documentation.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - - -rnn_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

rnn_serving package

-
-

Submodules

-
-
-

rnn_serving.rnn_serving module

-
-
-class rnn_serving.rnn_serving.RNN_Model_Serving(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, **kwargs)[source]
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]
-

load and initialize the model and/or other elements

-
-
-
-predict(body)[source]
-

model prediction operation

-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/static/example.html b/functions/development/rnn_serving/0.8.0/static/example.html deleted file mode 100644 index 771e674b..00000000 --- a/functions/development/rnn_serving/0.8.0/static/example.html +++ /dev/null @@ -1,306 +0,0 @@ - - - - - - - -RNN Serving - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

RNN Serving

-

The following section we create a new model serving function which wraps our class , and specify model and other resources.
-Deploying the serving function will provide us an http endpoint that can handle requests in real time.
-This function is part of the stock-analysis demo.
-To see how the model is trained or how the data-set is generated, check out code folder in the demo repository.

-
-

Steps

-
    -
  1. Setup function parameters

  2. -
  3. Importing the function

  4. -
  5. Testing the function locally

  6. -
  7. Testing the function remotely

  8. -
-
-
-
import warnings
-warnings.filterwarnings("ignore")
-
-
-
-
-
-
-
# Following packages are required, make sure to install
-# !pip install pip install torch==1.6.0
-# !pip install tensorflow
-# !pip install keras
-
-
-
-
-
-
-

Setup function parameters

-
-
-
# Setting up models path
-rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'
-
-
-
-
-
-
-

Importing the function

-
-
-
import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-# Importing the function from the hub
-fn = mlrun.import_function("hub://rnn_serving")
-fn.apply(mlrun.auto_mount())
-
-# Manually specifying needed packages 
-fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']
-
-# Adding the model 
-fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')
-
-
-
-
-
> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB
-
-
-
<mlrun.serving.states.TaskStep at 0x7fb59c8fa2d0>
-
-
-
-
-
-
-

Testing the function locally

-
-
-
# When mocking, class has to be present
-from rnn_serving import *
-
-# Mocking function
-server = fn.to_mock_server()
-
-
-
-
-
> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded
-> 2021-10-17 10:43:54,257 [info] Initializing endpoint records
-> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']
-
-
-
-
-
-
-
# Getting the data
-import cloudpickle as cp
-from urllib.request import urlopen
-
-rnn_data = cp.load(urlopen(data_path))
-
-
-
-
-

model used in this example take inputs with the shape (None, None, 11).
-whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
-and the last dimenstion is the number of features the dataset have.
-our testing dataset has (1,10,11) means one instance to predict, with sequence length of 10, each step has 11 features.

-
-
-
import requests
-
-# KFServing protocol event
-event_data = {"inputs": rnn_data}
-
-response = server.test(path='/v2/models/rnn_model/predict',body=event_data)
-response
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-
-
-

Testing the function remotely

-
-
-
address = fn.deploy()
-
-
-
-
-
> 2021-10-17 10:43:57,192 [info] Starting remote function deploy
-2021-10-17 10:43:57  (info) Deploying function
-2021-10-17 10:43:57  (info) Building
-2021-10-17 10:43:57  (info) Staging files and preparing base images
-2021-10-17 10:43:57  (info) Building processor image
-2021-10-17 10:43:58  (info) Build complete
-2021-10-17 10:44:10  (info) Function deploy complete
-> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}
-
-
-
-
-
-
-
import json
-import requests
-
-# using requests to predict
-response = requests.put(address+"/v2/models/rnn_model/predict", json = json.dumps(event_data))
-json.loads(response.text)
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-

Back to the top

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/static/function.html b/functions/development/rnn_serving/0.8.0/static/function.html deleted file mode 100644 index 6b205d54..00000000 --- a/functions/development/rnn_serving/0.8.0/static/function.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: rnn-serving
-  tag: ''
-  hash: 548cd27edfdc49aed0b069d94bd049435d484722
-  project: default
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: deploy an rnn based stock analysis model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: rnn-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py
-    spec:
-      runtime: python:3.6
-      handler: rnn_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py
-    origin_filename: /User/test/functions/rnn_serving/rnn_serving.py
-  secret_sources: []
-  mount_applied: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/static/item.html b/functions/development/rnn_serving/0.8.0/static/item.html deleted file mode 100644 index 3befe36d..00000000 --- a/functions/development/rnn_serving/0.8.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an rnn based stock analysis model server.
-doc: ''
-example: rnn_serving.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: rnn-serving
-platformVersion: 3.2.0
-spec:
-  filename: rnn_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: serving
-  requirements: null
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.8.0/static/source.html b/functions/development/rnn_serving/0.8.0/static/source.html deleted file mode 100644 index 8f648e8e..00000000 --- a/functions/development/rnn_serving/0.8.0/static/source.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file, extra_data = self.get_model(suffix=".h5")
-        self.model = keras.models.load_model(model_file)
-
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/src/function.yaml b/functions/development/rnn_serving/0.9.0/src/function.yaml deleted file mode 100644 index 7a09e1f4..00000000 --- a/functions/development/rnn_serving/0.9.0/src/function.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: serving -metadata: - name: rnn-serving - tag: '' - hash: 548cd27edfdc49aed0b069d94bd049435d484722 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - description: deploy an rnn based stock analysis model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: rnn-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py - spec: - runtime: python:3.6 - handler: rnn_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving_v2 - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py - origin_filename: /User/test/functions/rnn_serving/rnn_serving.py - secret_sources: [] - mount_applied: false - affinity: null -verbose: false diff --git a/functions/development/rnn_serving/0.9.0/src/item.yaml b/functions/development/rnn_serving/0.9.0/src/item.yaml deleted file mode 100644 index 8203450a..00000000 --- a/functions/development/rnn_serving/0.9.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an rnn based stock analysis model server. 
-doc: '' -example: rnn_serving.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: rnn-serving -platformVersion: 3.2.0 -spec: - filename: rnn_serving.py - handler: handler - image: mlrun/ml-models - kind: serving - requirements: null -url: '' -version: 0.9.0 diff --git a/functions/development/rnn_serving/0.9.0/src/requirements.txt b/functions/development/rnn_serving/0.9.0/src/requirements.txt deleted file mode 100644 index 2764273d..00000000 --- a/functions/development/rnn_serving/0.9.0/src/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -mlrun -keras -tensorflow -wget \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/src/rnn_serving.ipynb b/functions/development/rnn_serving/0.9.0/src/rnn_serving.ipynb deleted file mode 100644 index dbdf3b87..00000000 --- a/functions/development/rnn_serving/0.9.0/src/rnn_serving.ipynb +++ /dev/null @@ -1,285 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **RNN Serving**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following section we create a new model serving function which wraps our class , and specify model and other resources.
\n", - "Deploying the serving function will provide us an http endpoint that can handle requests in real time.
\n", - "This function is part of the [stock-analysis demo](https://github.com/mlrun/demos/tree/master/stock-analysis).
\n", - "To see how the model is trained or how the data-set is generated, check out code folder in the demo repository." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Setup function parameters](#Setup-function-parameters)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Testing the function locally](#Testing-the-function-locally)\n", - "4. [Testing the function remotely](#Testing-the-function-remotely)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Following packages are required, make sure to install\n", - "# !pip install pip install torch==1.6.0\n", - "# !pip install tensorflow\n", - "# !pip install keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Setup function parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Setting up models path\n", - "rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - 
"mlrun.set_environment(project='function-marketplace')\n", - "\n", - "# Importing the function from the hub\n", - "fn = mlrun.import_function(\"hub://rnn_serving\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "# Manually specifying needed packages \n", - "fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']\n", - "\n", - "# Adding the model \n", - "fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded\n", - "> 2021-10-17 10:43:54,257 [info] Initializing endpoint records\n", - "> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']\n" - ] - } - ], - "source": [ - "# When mocking, class has to be present\n", - "from rnn_serving import *\n", - "\n", - "# Mocking function\n", - "server = fn.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting the data\n", - "import cloudpickle as cp\n", - "from urllib.request import urlopen\n", - "\n", - "rnn_data = cp.load(urlopen(data_path))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "model used in this example take inputs with the shape `(None, None, 11)`.
\n", - "whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
\n", - "and the last dimenstion is the number of features the dataset have.
\n", - "our testing dataset has `(1,10,11)` means one instance to predict, with sequence length of 10, each step has 11 features." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import requests\n", - "\n", - "# KFServing protocol event\n", - "event_data = {\"inputs\": rnn_data}\n", - "\n", - "response = server.test(path='/v2/models/rnn_model/predict',body=event_data)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:57,192 [info] Starting remote function deploy\n", - "2021-10-17 10:43:57 (info) Deploying function\n", - "2021-10-17 10:43:57 (info) Building\n", - "2021-10-17 10:43:57 (info) Staging files and preparing base images\n", - "2021-10-17 10:43:57 (info) Building processor image\n", - "2021-10-17 10:43:58 (info) Build complete\n", - "2021-10-17 10:44:10 (info) Function deploy complete\n", - "> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}\n" - ] - } - ], - "source": [ - "address = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 10, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import requests\n", - "\n", - "# using requests to predict\n", - "response = requests.put(address+\"/v2/models/rnn_model/predict\", json = json.dumps(event_data))\n", - "json.loads(response.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#RNN-Serving)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/rnn_serving/0.9.0/src/rnn_serving.py b/functions/development/rnn_serving/0.9.0/src/rnn_serving.py deleted file mode 100644 index aa66ee9f..00000000 --- a/functions/development/rnn_serving/0.9.0/src/rnn_serving.py +++ /dev/null @@ -1,21 +0,0 @@ -import mlrun -import numpy as np -from tensorflow import keras -import json - - -class RNN_Model_Serving(mlrun.serving.V2ModelServer): - def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file) - - def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/src/test_rnn_serving.py b/functions/development/rnn_serving/0.9.0/src/test_rnn_serving.py deleted file mode 100644 index 255ac5e7..00000000 --- 
a/functions/development/rnn_serving/0.9.0/src/test_rnn_serving.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import wget -from mlrun import import_function -from os import path -from rnn_serving import * - -DATASET = np.array([[6.9955170e-01, 6.9952875e-01, 2.7922913e-02, 2.7853036e-02, - 6.9955170e-01, 7.0086759e-01, 7.0118028e-01, 7.0142627e-01, - 2.7922913e-02, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 6.9998503e-01, 1.6527303e-03, 2.7853036e-02, - 7.0000792e-01, 7.0085293e-01, 7.0118028e-01, 7.0203447e-01, - 1.6527303e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0025057e-01, 1.6904050e-04, 2.7853036e-02, - 7.0027345e-01, 7.0014298e-01, 7.0190376e-01, 7.0128226e-01, - 1.6904050e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0144778e-01, 1.6904050e-04, 2.7853036e-02, - 7.0147055e-01, 7.0178574e-01, 7.0236105e-01, 7.0295709e-01, - 7.3906886e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0326620e-01, 7.0308524e-01, 7.0490342e-01, 7.0427048e-01, - 2.4815742e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0191067e-01, 7.0173001e-01, 7.0354480e-01, 7.0291305e-01, - 2.9976186e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0166123e-01, 7.0148063e-01, 7.0284635e-01, 7.0249581e-01, - 2.7904075e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0133996e-01, 7.0143080e-01, 7.0297277e-01, 7.0250750e-01, - 4.1491759e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0150572e-01, 7.0251614e-01, 7.0281982e-01, 7.0370042e-01, - 2.1256472e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0272487e-01, 7.0258951e-01, 7.0429617e-01, 7.0376801e-01, - 1.4207334e-03, 0.0000000e+00, 0.0000000e+00]]).reshape(1, 10, 11).tolist() - - -def 
download_pretrained_model(model_path): - # Run this to download the pre-trained model to your `models` directory - model_location = 'https://s3.wasabisys.com/iguazio/models/rnn/rnn_model.h5' - saved_models_directory = model_path - # Create paths - os.makedirs(saved_models_directory, exist_ok=1) - model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location)) - wget.download(model_location, model_filepath) - - -def test_rnn_serving(): - model_path = os.path.join(os.path.abspath('./'), 'models') - model = model_path + '/rnn_model.h5' - if not path.exists(model): - download_pretrained_model(model_path) - - fn = import_function('function.yaml') - fn.add_model('rnn_model', model_path=model, class_name='RNN_Model_Serving') - # create an emulator (mock server) from the function configuration) - server = fn.to_mock_server() - resp = server.test("/v2/models/rnn_model/infer", {"inputs": DATASET}) - assert (resp['outputs'] == '[[0.453309565782547]]') diff --git a/functions/development/rnn_serving/0.9.0/static/documentation.html b/functions/development/rnn_serving/0.9.0/static/documentation.html deleted file mode 100644 index d1793a53..00000000 --- a/functions/development/rnn_serving/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -rnn_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

rnn_serving package

-
-

Submodules

-
-
-

rnn_serving.rnn_serving module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/static/example.html b/functions/development/rnn_serving/0.9.0/static/example.html deleted file mode 100644 index 771e674b..00000000 --- a/functions/development/rnn_serving/0.9.0/static/example.html +++ /dev/null @@ -1,306 +0,0 @@ - - - - - - - -RNN Serving - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

RNN Serving

-

The following section we create a new model serving function which wraps our class , and specify model and other resources.
-Deploying the serving function will provide us an http endpoint that can handle requests in real time.
-This function is part of the stock-analysis demo.
-To see how the model is trained or how the data-set is generated, check out code folder in the demo repository.

-
-

Steps

-
    -
  1. Setup function parameters

  2. -
  3. Importing the function

  4. -
  5. Testing the function locally

  6. -
  7. Testing the function remotely

  8. -
-
-
-
import warnings
-warnings.filterwarnings("ignore")
-
-
-
-
-
-
-
# Following packages are required, make sure to install
-# !pip install pip install torch==1.6.0
-# !pip install tensorflow
-# !pip install keras
-
-
-
-
-
-
-

Setup function parameters

-
-
-
# Setting up models path
-rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'
-
-
-
-
-
-
-

Importing the function

-
-
-
import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-# Importing the function from the hub
-fn = mlrun.import_function("hub://rnn_serving")
-fn.apply(mlrun.auto_mount())
-
-# Manually specifying needed packages 
-fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']
-
-# Adding the model 
-fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')
-
-
-
-
-
> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB
-
-
-
<mlrun.serving.states.TaskStep at 0x7fb59c8fa2d0>
-
-
-
-
-
-
-

Testing the function locally

-
-
-
# When mocking, class has to be present
-from rnn_serving import *
-
-# Mocking function
-server = fn.to_mock_server()
-
-
-
-
-
> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded
-> 2021-10-17 10:43:54,257 [info] Initializing endpoint records
-> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']
-
-
-
-
-
-
-
# Getting the data
-import cloudpickle as cp
-from urllib.request import urlopen
-
-rnn_data = cp.load(urlopen(data_path))
-
-
-
-
-

model used in this example take inputs with the shape (None, None, 11).
-whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
-and the last dimenstion is the number of features the dataset have.
-our testing dataset has (1,10,11) means one instance to predict, with sequence length of 10, each step has 11 features.

-
-
-
import requests
-
-# KFServing protocol event
-event_data = {"inputs": rnn_data}
-
-response = server.test(path='/v2/models/rnn_model/predict',body=event_data)
-response
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-
-
-

Testing the function remotely

-
-
-
address = fn.deploy()
-
-
-
-
-
> 2021-10-17 10:43:57,192 [info] Starting remote function deploy
-2021-10-17 10:43:57  (info) Deploying function
-2021-10-17 10:43:57  (info) Building
-2021-10-17 10:43:57  (info) Staging files and preparing base images
-2021-10-17 10:43:57  (info) Building processor image
-2021-10-17 10:43:58  (info) Build complete
-2021-10-17 10:44:10  (info) Function deploy complete
-> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}
-
-
-
-
-
-
-
import json
-import requests
-
-# using requests to predict
-response = requests.put(address+"/v2/models/rnn_model/predict", json = json.dumps(event_data))
-json.loads(response.text)
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-

Back to the top

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/static/function.html b/functions/development/rnn_serving/0.9.0/static/function.html deleted file mode 100644 index 8a7e0549..00000000 --- a/functions/development/rnn_serving/0.9.0/static/function.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: rnn-serving
-  tag: ''
-  hash: 548cd27edfdc49aed0b069d94bd049435d484722
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: deploy an rnn based stock analysis model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: rnn-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py
-    spec:
-      runtime: python:3.6
-      handler: rnn_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py
-    origin_filename: /User/test/functions/rnn_serving/rnn_serving.py
-  secret_sources: []
-  mount_applied: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/static/item.html b/functions/development/rnn_serving/0.9.0/static/item.html deleted file mode 100644 index 1cdc2b58..00000000 --- a/functions/development/rnn_serving/0.9.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an rnn based stock analysis model server.
-doc: ''
-example: rnn_serving.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: rnn-serving
-platformVersion: 3.2.0
-spec:
-  filename: rnn_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: serving
-  requirements: null
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/0.9.0/static/source.html b/functions/development/rnn_serving/0.9.0/static/source.html deleted file mode 100644 index 8f648e8e..00000000 --- a/functions/development/rnn_serving/0.9.0/static/source.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file, extra_data = self.get_model(suffix=".h5")
-        self.model = keras.models.load_model(model_file)
-
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/src/function.yaml b/functions/development/rnn_serving/1.0.0/src/function.yaml deleted file mode 100644 index 7a09e1f4..00000000 --- a/functions/development/rnn_serving/1.0.0/src/function.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: serving -metadata: - name: rnn-serving - tag: '' - hash: 548cd27edfdc49aed0b069d94bd049435d484722 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - description: deploy an rnn based stock analysis model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: rnn-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py - spec: - runtime: python:3.6 - handler: rnn_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving_v2 - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py - origin_filename: /User/test/functions/rnn_serving/rnn_serving.py - secret_sources: [] - mount_applied: false - affinity: null -verbose: false diff --git a/functions/development/rnn_serving/1.0.0/src/item.yaml b/functions/development/rnn_serving/1.0.0/src/item.yaml deleted file mode 100644 index 50a17871..00000000 --- a/functions/development/rnn_serving/1.0.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an rnn based stock analysis model server. 
-doc: '' -example: rnn_serving.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: rnn-serving -platformVersion: 3.2.0 -spec: - filename: rnn_serving.py - handler: handler - image: mlrun/ml-models - kind: serving - requirements: null -url: '' -version: 1.0.0 diff --git a/functions/development/rnn_serving/1.0.0/src/requirements.txt b/functions/development/rnn_serving/1.0.0/src/requirements.txt deleted file mode 100644 index 33d855c7..00000000 --- a/functions/development/rnn_serving/1.0.0/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mlrun -tensorflow==2.8.2 -wget \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/src/rnn_serving.ipynb b/functions/development/rnn_serving/1.0.0/src/rnn_serving.ipynb deleted file mode 100644 index dbdf3b87..00000000 --- a/functions/development/rnn_serving/1.0.0/src/rnn_serving.ipynb +++ /dev/null @@ -1,285 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **RNN Serving**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following section we create a new model serving function which wraps our class , and specify model and other resources.
\n", - "Deploying the serving function will provide us an http endpoint that can handle requests in real time.
\n", - "This function is part of the [stock-analysis demo](https://github.com/mlrun/demos/tree/master/stock-analysis).
\n", - "To see how the model is trained or how the data-set is generated, check out code folder in the demo repository." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Setup function parameters](#Setup-function-parameters)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Testing the function locally](#Testing-the-function-locally)\n", - "4. [Testing the function remotely](#Testing-the-function-remotely)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Following packages are required, make sure to install\n", - "# !pip install pip install torch==1.6.0\n", - "# !pip install tensorflow\n", - "# !pip install keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Setup function parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Setting up models path\n", - "rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - 
"mlrun.set_environment(project='function-marketplace')\n", - "\n", - "# Importing the function from the hub\n", - "fn = mlrun.import_function(\"hub://rnn_serving\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "# Manually specifying needed packages \n", - "fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']\n", - "\n", - "# Adding the model \n", - "fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded\n", - "> 2021-10-17 10:43:54,257 [info] Initializing endpoint records\n", - "> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']\n" - ] - } - ], - "source": [ - "# When mocking, class has to be present\n", - "from rnn_serving import *\n", - "\n", - "# Mocking function\n", - "server = fn.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting the data\n", - "import cloudpickle as cp\n", - "from urllib.request import urlopen\n", - "\n", - "rnn_data = cp.load(urlopen(data_path))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "model used in this example take inputs with the shape `(None, None, 11)`.
\n", - "whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
\n", - "and the last dimenstion is the number of features the dataset have.
\n", - "our testing dataset has `(1,10,11)` means one instance to predict, with sequence length of 10, each step has 11 features." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import requests\n", - "\n", - "# KFServing protocol event\n", - "event_data = {\"inputs\": rnn_data}\n", - "\n", - "response = server.test(path='/v2/models/rnn_model/predict',body=event_data)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:57,192 [info] Starting remote function deploy\n", - "2021-10-17 10:43:57 (info) Deploying function\n", - "2021-10-17 10:43:57 (info) Building\n", - "2021-10-17 10:43:57 (info) Staging files and preparing base images\n", - "2021-10-17 10:43:57 (info) Building processor image\n", - "2021-10-17 10:43:58 (info) Build complete\n", - "2021-10-17 10:44:10 (info) Function deploy complete\n", - "> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}\n" - ] - } - ], - "source": [ - "address = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 10, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import requests\n", - "\n", - "# using requests to predict\n", - "response = requests.put(address+\"/v2/models/rnn_model/predict\", json = json.dumps(event_data))\n", - "json.loads(response.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#RNN-Serving)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/rnn_serving/1.0.0/src/rnn_serving.py b/functions/development/rnn_serving/1.0.0/src/rnn_serving.py deleted file mode 100644 index aa66ee9f..00000000 --- a/functions/development/rnn_serving/1.0.0/src/rnn_serving.py +++ /dev/null @@ -1,21 +0,0 @@ -import mlrun -import numpy as np -from tensorflow import keras -import json - - -class RNN_Model_Serving(mlrun.serving.V2ModelServer): - def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file) - - def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/src/test_rnn_serving.py b/functions/development/rnn_serving/1.0.0/src/test_rnn_serving.py deleted file mode 100644 index 255ac5e7..00000000 --- 
a/functions/development/rnn_serving/1.0.0/src/test_rnn_serving.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import wget -from mlrun import import_function -from os import path -from rnn_serving import * - -DATASET = np.array([[6.9955170e-01, 6.9952875e-01, 2.7922913e-02, 2.7853036e-02, - 6.9955170e-01, 7.0086759e-01, 7.0118028e-01, 7.0142627e-01, - 2.7922913e-02, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 6.9998503e-01, 1.6527303e-03, 2.7853036e-02, - 7.0000792e-01, 7.0085293e-01, 7.0118028e-01, 7.0203447e-01, - 1.6527303e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0025057e-01, 1.6904050e-04, 2.7853036e-02, - 7.0027345e-01, 7.0014298e-01, 7.0190376e-01, 7.0128226e-01, - 1.6904050e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0144778e-01, 1.6904050e-04, 2.7853036e-02, - 7.0147055e-01, 7.0178574e-01, 7.0236105e-01, 7.0295709e-01, - 7.3906886e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0326620e-01, 7.0308524e-01, 7.0490342e-01, 7.0427048e-01, - 2.4815742e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0191067e-01, 7.0173001e-01, 7.0354480e-01, 7.0291305e-01, - 2.9976186e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0166123e-01, 7.0148063e-01, 7.0284635e-01, 7.0249581e-01, - 2.7904075e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0133996e-01, 7.0143080e-01, 7.0297277e-01, 7.0250750e-01, - 4.1491759e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0150572e-01, 7.0251614e-01, 7.0281982e-01, 7.0370042e-01, - 2.1256472e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0272487e-01, 7.0258951e-01, 7.0429617e-01, 7.0376801e-01, - 1.4207334e-03, 0.0000000e+00, 0.0000000e+00]]).reshape(1, 10, 11).tolist() - - -def 
download_pretrained_model(model_path): - # Run this to download the pre-trained model to your `models` directory - model_location = 'https://s3.wasabisys.com/iguazio/models/rnn/rnn_model.h5' - saved_models_directory = model_path - # Create paths - os.makedirs(saved_models_directory, exist_ok=1) - model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location)) - wget.download(model_location, model_filepath) - - -def test_rnn_serving(): - model_path = os.path.join(os.path.abspath('./'), 'models') - model = model_path + '/rnn_model.h5' - if not path.exists(model): - download_pretrained_model(model_path) - - fn = import_function('function.yaml') - fn.add_model('rnn_model', model_path=model, class_name='RNN_Model_Serving') - # create an emulator (mock server) from the function configuration) - server = fn.to_mock_server() - resp = server.test("/v2/models/rnn_model/infer", {"inputs": DATASET}) - assert (resp['outputs'] == '[[0.453309565782547]]') diff --git a/functions/development/rnn_serving/1.0.0/static/documentation.html b/functions/development/rnn_serving/1.0.0/static/documentation.html deleted file mode 100644 index 0a784615..00000000 --- a/functions/development/rnn_serving/1.0.0/static/documentation.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - - -rnn_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

rnn_serving package

-
-

Submodules

-
-
-

rnn_serving.rnn_serving module

-
-
-class rnn_serving.rnn_serving.RNN_Model_Serving(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]
-

load and initialize the model and/or other elements

-
-
-
-predict(body)[source]
-

model prediction operation

-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/static/example.html b/functions/development/rnn_serving/1.0.0/static/example.html deleted file mode 100644 index 6fdff0c2..00000000 --- a/functions/development/rnn_serving/1.0.0/static/example.html +++ /dev/null @@ -1,309 +0,0 @@ - - - - - - - -RNN Serving - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

RNN Serving

-

The following section we create a new model serving function which wraps our class , and specify model and other resources.
-Deploying the serving function will provide us an http endpoint that can handle requests in real time.
-This function is part of the stock-analysis demo.
-To see how the model is trained or how the data-set is generated, check out code folder in the demo repository.

-
-

Steps

-
    -
  1. Setup function parameters

  2. -
  3. Importing the function

  4. -
  5. Testing the function locally

  6. -
  7. Testing the function remotely

  8. -
-
-
-
import warnings
-warnings.filterwarnings("ignore")
-
-
-
-
-
-
-
# Following packages are required, make sure to install
-# !pip install pip install torch==1.6.0
-# !pip install tensorflow
-# !pip install keras
-
-
-
-
-
-
-

Setup function parameters

-
-
-
# Setting up models path
-rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'
-
-
-
-
-
-
-

Importing the function

-
-
-
import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-# Importing the function from the hub
-fn = mlrun.import_function("hub://rnn_serving")
-fn.apply(mlrun.auto_mount())
-
-# Manually specifying needed packages 
-fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']
-
-# Adding the model 
-fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')
-
-
-
-
-
> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB
-
-
-
<mlrun.serving.states.TaskStep at 0x7fb59c8fa2d0>
-
-
-
-
-
-
-

Testing the function locally

-
-
-
# When mocking, class has to be present
-from rnn_serving import *
-
-# Mocking function
-server = fn.to_mock_server()
-
-
-
-
-
> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded
-> 2021-10-17 10:43:54,257 [info] Initializing endpoint records
-> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']
-
-
-
-
-
-
-
# Getting the data
-import cloudpickle as cp
-from urllib.request import urlopen
-
-rnn_data = cp.load(urlopen(data_path))
-
-
-
-
-

model used in this example take inputs with the shape (None, None, 11).
-whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
-and the last dimenstion is the number of features the dataset have.
-our testing dataset has (1,10,11) means one instance to predict, with sequence length of 10, each step has 11 features.

-
-
-
import requests
-
-# KFServing protocol event
-event_data = {"inputs": rnn_data}
-
-response = server.test(path='/v2/models/rnn_model/predict',body=event_data)
-response
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-
-
-

Testing the function remotely

-
-
-
address = fn.deploy()
-
-
-
-
-
> 2021-10-17 10:43:57,192 [info] Starting remote function deploy
-2021-10-17 10:43:57  (info) Deploying function
-2021-10-17 10:43:57  (info) Building
-2021-10-17 10:43:57  (info) Staging files and preparing base images
-2021-10-17 10:43:57  (info) Building processor image
-2021-10-17 10:43:58  (info) Build complete
-2021-10-17 10:44:10  (info) Function deploy complete
-> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}
-
-
-
-
-
-
-
import json
-import requests
-
-# using requests to predict
-response = requests.put(address+"/v2/models/rnn_model/predict", json = json.dumps(event_data))
-json.loads(response.text)
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-

Back to the top

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/static/function.html b/functions/development/rnn_serving/1.0.0/static/function.html deleted file mode 100644 index 8a7e0549..00000000 --- a/functions/development/rnn_serving/1.0.0/static/function.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: rnn-serving
-  tag: ''
-  hash: 548cd27edfdc49aed0b069d94bd049435d484722
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: deploy an rnn based stock analysis model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: rnn-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py
-    spec:
-      runtime: python:3.6
-      handler: rnn_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py
-    origin_filename: /User/test/functions/rnn_serving/rnn_serving.py
-  secret_sources: []
-  mount_applied: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/static/item.html b/functions/development/rnn_serving/1.0.0/static/item.html deleted file mode 100644 index a8f05d83..00000000 --- a/functions/development/rnn_serving/1.0.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an rnn based stock analysis model server.
-doc: ''
-example: rnn_serving.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: rnn-serving
-platformVersion: 3.2.0
-spec:
-  filename: rnn_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: serving
-  requirements: null
-url: ''
-version: 1.0.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.0.0/static/source.html b/functions/development/rnn_serving/1.0.0/static/source.html deleted file mode 100644 index 8f648e8e..00000000 --- a/functions/development/rnn_serving/1.0.0/static/source.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file, extra_data = self.get_model(suffix=".h5")
-        self.model = keras.models.load_model(model_file)
-
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/src/function.yaml b/functions/development/rnn_serving/1.1.0/src/function.yaml deleted file mode 100644 index 7a09e1f4..00000000 --- a/functions/development/rnn_serving/1.1.0/src/function.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: serving -metadata: - name: rnn-serving - tag: '' - hash: 548cd27edfdc49aed0b069d94bd049435d484722 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - description: deploy an rnn based stock analysis model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: rnn-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py - spec: - runtime: python:3.6 - handler: rnn_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving_v2 - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py - origin_filename: /User/test/functions/rnn_serving/rnn_serving.py - secret_sources: [] - mount_applied: false - affinity: null -verbose: false diff --git a/functions/development/rnn_serving/1.1.0/src/item.yaml b/functions/development/rnn_serving/1.1.0/src/item.yaml deleted file mode 100644 index 5cc7b936..00000000 --- a/functions/development/rnn_serving/1.1.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an rnn based stock analysis model server. 
-doc: '' -example: rnn_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: rnn-serving -platformVersion: 3.5.0 -spec: - filename: rnn_serving.py - handler: handler - image: mlrun/ml-models - kind: serving - requirements: null -url: '' -version: 1.1.0 diff --git a/functions/development/rnn_serving/1.1.0/src/requirements.txt b/functions/development/rnn_serving/1.1.0/src/requirements.txt deleted file mode 100644 index ff480e35..00000000 --- a/functions/development/rnn_serving/1.1.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -tensorflow==2.8.2 -wget \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/src/rnn_serving.ipynb b/functions/development/rnn_serving/1.1.0/src/rnn_serving.ipynb deleted file mode 100644 index dbdf3b87..00000000 --- a/functions/development/rnn_serving/1.1.0/src/rnn_serving.ipynb +++ /dev/null @@ -1,285 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **RNN Serving**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following section we create a new model serving function which wraps our class , and specify model and other resources.
\n", - "Deploying the serving function will provide us an http endpoint that can handle requests in real time.
\n", - "This function is part of the [stock-analysis demo](https://github.com/mlrun/demos/tree/master/stock-analysis).
\n", - "To see how the model is trained or how the data-set is generated, check out code folder in the demo repository." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Setup function parameters](#Setup-function-parameters)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Testing the function locally](#Testing-the-function-locally)\n", - "4. [Testing the function remotely](#Testing-the-function-remotely)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Following packages are required, make sure to install\n", - "# !pip install pip install torch==1.6.0\n", - "# !pip install tensorflow\n", - "# !pip install keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Setup function parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Setting up models path\n", - "rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - 
"mlrun.set_environment(project='function-marketplace')\n", - "\n", - "# Importing the function from the hub\n", - "fn = mlrun.import_function(\"hub://rnn_serving\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "# Manually specifying needed packages \n", - "fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']\n", - "\n", - "# Adding the model \n", - "fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded\n", - "> 2021-10-17 10:43:54,257 [info] Initializing endpoint records\n", - "> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']\n" - ] - } - ], - "source": [ - "# When mocking, class has to be present\n", - "from rnn_serving import *\n", - "\n", - "# Mocking function\n", - "server = fn.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting the data\n", - "import cloudpickle as cp\n", - "from urllib.request import urlopen\n", - "\n", - "rnn_data = cp.load(urlopen(data_path))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "model used in this example take inputs with the shape `(None, None, 11)`.
\n", - "whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
\n", - "and the last dimenstion is the number of features the dataset have.
\n", - "our testing dataset has `(1,10,11)` means one instance to predict, with sequence length of 10, each step has 11 features." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import requests\n", - "\n", - "# KFServing protocol event\n", - "event_data = {\"inputs\": rnn_data}\n", - "\n", - "response = server.test(path='/v2/models/rnn_model/predict',body=event_data)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:57,192 [info] Starting remote function deploy\n", - "2021-10-17 10:43:57 (info) Deploying function\n", - "2021-10-17 10:43:57 (info) Building\n", - "2021-10-17 10:43:57 (info) Staging files and preparing base images\n", - "2021-10-17 10:43:57 (info) Building processor image\n", - "2021-10-17 10:43:58 (info) Build complete\n", - "2021-10-17 10:44:10 (info) Function deploy complete\n", - "> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}\n" - ] - } - ], - "source": [ - "address = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 10, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import requests\n", - "\n", - "# using requests to predict\n", - "response = requests.put(address+\"/v2/models/rnn_model/predict\", json = json.dumps(event_data))\n", - "json.loads(response.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#RNN-Serving)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/rnn_serving/1.1.0/src/rnn_serving.py b/functions/development/rnn_serving/1.1.0/src/rnn_serving.py deleted file mode 100644 index d7e783d7..00000000 --- a/functions/development/rnn_serving/1.1.0/src/rnn_serving.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import mlrun -import numpy as np -from tensorflow import keras -import json - - -class RNN_Model_Serving(mlrun.serving.V2ModelServer): - def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file) - - def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/src/test_rnn_serving.py b/functions/development/rnn_serving/1.1.0/src/test_rnn_serving.py deleted file mode 100644 index fb2f4997..00000000 --- a/functions/development/rnn_serving/1.1.0/src/test_rnn_serving.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import wget -from mlrun import import_function -from os import path -from rnn_serving import * - -DATASET = np.array([[6.9955170e-01, 6.9952875e-01, 2.7922913e-02, 2.7853036e-02, - 6.9955170e-01, 7.0086759e-01, 7.0118028e-01, 7.0142627e-01, - 2.7922913e-02, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 6.9998503e-01, 1.6527303e-03, 2.7853036e-02, - 7.0000792e-01, 7.0085293e-01, 7.0118028e-01, 7.0203447e-01, - 1.6527303e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0025057e-01, 1.6904050e-04, 2.7853036e-02, - 7.0027345e-01, 7.0014298e-01, 7.0190376e-01, 7.0128226e-01, - 1.6904050e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0144778e-01, 1.6904050e-04, 2.7853036e-02, - 7.0147055e-01, 7.0178574e-01, 7.0236105e-01, 7.0295709e-01, - 7.3906886e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0326620e-01, 7.0308524e-01, 7.0490342e-01, 7.0427048e-01, - 2.4815742e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0191067e-01, 7.0173001e-01, 7.0354480e-01, 7.0291305e-01, - 2.9976186e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0166123e-01, 7.0148063e-01, 7.0284635e-01, 7.0249581e-01, - 2.7904075e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0133996e-01, 7.0143080e-01, 7.0297277e-01, 7.0250750e-01, - 4.1491759e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0150572e-01, 7.0251614e-01, 7.0281982e-01, 7.0370042e-01, - 2.1256472e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0272487e-01, 7.0258951e-01, 7.0429617e-01, 7.0376801e-01, - 1.4207334e-03, 0.0000000e+00, 0.0000000e+00]]).reshape(1, 10, 11).tolist() - - -def download_pretrained_model(model_path): - # Run this to download the pre-trained model to your 
`models` directory - model_location = 'https://s3.wasabisys.com/iguazio/models/rnn/rnn_model.h5' - saved_models_directory = model_path - # Create paths - os.makedirs(saved_models_directory, exist_ok=1) - model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location)) - wget.download(model_location, model_filepath) - - -def test_rnn_serving(): - model_path = os.path.join(os.path.abspath('./'), 'models') - model = model_path + '/rnn_model.h5' - if not path.exists(model): - download_pretrained_model(model_path) - - fn = import_function('function.yaml') - fn.add_model('rnn_model', model_path=model, class_name='RNN_Model_Serving') - # create an emulator (mock server) from the function configuration) - server = fn.to_mock_server() - resp = server.test("/v2/models/rnn_model/infer", {"inputs": DATASET}) - assert (resp['outputs'] == '[[0.453309565782547]]') diff --git a/functions/development/rnn_serving/1.1.0/static/documentation.html b/functions/development/rnn_serving/1.1.0/static/documentation.html deleted file mode 100644 index eb387b20..00000000 --- a/functions/development/rnn_serving/1.1.0/static/documentation.html +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - -rnn_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

rnn_serving package

- -
- -
-
-
-
-
-

rnn_serving package#

-
-

Submodules#

-
-
-

rnn_serving.rnn_serving module#

-
-
-class rnn_serving.rnn_serving.RNN_Model_Serving(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]#
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]#
-

load and initialize the model and/or other elements

-
-
-
-predict(body)[source]#
-

model prediction operation

-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/static/example.html b/functions/development/rnn_serving/1.1.0/static/example.html deleted file mode 100644 index 6855b7c1..00000000 --- a/functions/development/rnn_serving/1.1.0/static/example.html +++ /dev/null @@ -1,433 +0,0 @@ - - - - - - - -RNN Serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

RNN Serving#

-

The following section we create a new model serving function which wraps our class , and specify model and other resources.
-Deploying the serving function will provide us an http endpoint that can handle requests in real time.
-This function is part of the stock-analysis demo.
-To see how the model is trained or how the data-set is generated, check out code folder in the demo repository.

-
-

Steps#

-
    -
  1. Setup function parameters

  2. -
  3. Importing the function

  4. -
  5. Testing the function locally

  6. -
  7. Testing the function remotely

  8. -
-
-
-
import warnings
-warnings.filterwarnings("ignore")
-
-
-
-
-
-
-
# Following packages are required, make sure to install
-# !pip install pip install torch==1.6.0
-# !pip install tensorflow
-# !pip install keras
-
-
-
-
-
-
-

Setup function parameters#

-
-
-
# Setting up models path
-rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'
-
-
-
-
-
-
-

Importing the function#

-
-
-
import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-# Importing the function from the hub
-fn = mlrun.import_function("hub://rnn_serving")
-fn.apply(mlrun.auto_mount())
-
-# Manually specifying needed packages 
-fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']
-
-# Adding the model 
-fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')
-
-
-
-
-
> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB
-
-
-
<mlrun.serving.states.TaskStep at 0x7fb59c8fa2d0>
-
-
-
-
-
-
-

Testing the function locally#

-
-
-
# When mocking, class has to be present
-from rnn_serving import *
-
-# Mocking function
-server = fn.to_mock_server()
-
-
-
-
-
> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded
-> 2021-10-17 10:43:54,257 [info] Initializing endpoint records
-> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']
-
-
-
-
-
-
-
# Getting the data
-import cloudpickle as cp
-from urllib.request import urlopen
-
-rnn_data = cp.load(urlopen(data_path))
-
-
-
-
-

model used in this example take inputs with the shape (None, None, 11).
-whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
-and the last dimenstion is the number of features the dataset have.
-our testing dataset has (1,10,11) means one instance to predict, with sequence length of 10, each step has 11 features.

-
-
-
import requests
-
-# KFServing protocol event
-event_data = {"inputs": rnn_data}
-
-response = server.test(path='/v2/models/rnn_model/predict',body=event_data)
-response
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-
-
-

Testing the function remotely#

-
-
-
address = fn.deploy()
-
-
-
-
-
> 2021-10-17 10:43:57,192 [info] Starting remote function deploy
-2021-10-17 10:43:57  (info) Deploying function
-2021-10-17 10:43:57  (info) Building
-2021-10-17 10:43:57  (info) Staging files and preparing base images
-2021-10-17 10:43:57  (info) Building processor image
-2021-10-17 10:43:58  (info) Build complete
-2021-10-17 10:44:10  (info) Function deploy complete
-> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}
-
-
-
-
-
-
-
import json
-import requests
-
-# using requests to predict
-response = requests.put(address+"/v2/models/rnn_model/predict", json = json.dumps(event_data))
-json.loads(response.text)
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/static/function.html b/functions/development/rnn_serving/1.1.0/static/function.html deleted file mode 100644 index 8a7e0549..00000000 --- a/functions/development/rnn_serving/1.1.0/static/function.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: rnn-serving
-  tag: ''
-  hash: 548cd27edfdc49aed0b069d94bd049435d484722
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: deploy an rnn based stock analysis model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: rnn-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py
-    spec:
-      runtime: python:3.6
-      handler: rnn_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py
-    origin_filename: /User/test/functions/rnn_serving/rnn_serving.py
-  secret_sources: []
-  mount_applied: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/static/item.html b/functions/development/rnn_serving/1.1.0/static/item.html deleted file mode 100644 index e5ad52f5..00000000 --- a/functions/development/rnn_serving/1.1.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an rnn based stock analysis model server.
-doc: ''
-example: rnn_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: rnn-serving
-platformVersion: 3.5.0
-spec:
-  filename: rnn_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: serving
-  requirements: null
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/static/rnn_serving.html b/functions/development/rnn_serving/1.1.0/static/rnn_serving.html deleted file mode 100644 index 449e55e7..00000000 --- a/functions/development/rnn_serving/1.1.0/static/rnn_serving.html +++ /dev/null @@ -1,175 +0,0 @@ - - - - - - - -rnn_serving.rnn_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for rnn_serving.rnn_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-
[docs]class RNN_Model_Serving(mlrun.serving.V2ModelServer): -
[docs] def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file)
- -
[docs] def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/rnn_serving/1.1.0/static/source.html b/functions/development/rnn_serving/1.1.0/static/source.html deleted file mode 100644 index d7b2b50d..00000000 --- a/functions/development/rnn_serving/1.1.0/static/source.html +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file, extra_data = self.get_model(suffix=".h5")
-        self.model = keras.models.load_model(model_file)
-
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/src/function.yaml b/functions/development/rnn_serving/latest/src/function.yaml deleted file mode 100644 index 7a09e1f4..00000000 --- a/functions/development/rnn_serving/latest/src/function.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: serving -metadata: - name: rnn-serving - tag: '' - hash: 548cd27edfdc49aed0b069d94bd049435d484722 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - description: deploy an rnn based stock analysis model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: rnn-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py - spec: - runtime: python:3.6 - handler: rnn_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving_v2 - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py - origin_filename: /User/test/functions/rnn_serving/rnn_serving.py - secret_sources: [] - mount_applied: false - affinity: null -verbose: false diff --git a/functions/development/rnn_serving/latest/src/item.yaml b/functions/development/rnn_serving/latest/src/item.yaml deleted file mode 100644 index 5cc7b936..00000000 --- a/functions/development/rnn_serving/latest/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an rnn based stock analysis model server. 
-doc: '' -example: rnn_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: rnn-serving -platformVersion: 3.5.0 -spec: - filename: rnn_serving.py - handler: handler - image: mlrun/ml-models - kind: serving - requirements: null -url: '' -version: 1.1.0 diff --git a/functions/development/rnn_serving/latest/src/requirements.txt b/functions/development/rnn_serving/latest/src/requirements.txt deleted file mode 100644 index ff480e35..00000000 --- a/functions/development/rnn_serving/latest/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -tensorflow==2.8.2 -wget \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/src/rnn_serving.ipynb b/functions/development/rnn_serving/latest/src/rnn_serving.ipynb deleted file mode 100644 index dbdf3b87..00000000 --- a/functions/development/rnn_serving/latest/src/rnn_serving.ipynb +++ /dev/null @@ -1,285 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **RNN Serving**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following section we create a new model serving function which wraps our class , and specify model and other resources.
\n", - "Deploying the serving function will provide us an http endpoint that can handle requests in real time.
\n", - "This function is part of the [stock-analysis demo](https://github.com/mlrun/demos/tree/master/stock-analysis).
\n", - "To see how the model is trained or how the data-set is generated, check out code folder in the demo repository." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Setup function parameters](#Setup-function-parameters)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Testing the function locally](#Testing-the-function-locally)\n", - "4. [Testing the function remotely](#Testing-the-function-remotely)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Following packages are required, make sure to install\n", - "# !pip install pip install torch==1.6.0\n", - "# !pip install tensorflow\n", - "# !pip install keras" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Setup function parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Setting up models path\n", - "rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - 
"mlrun.set_environment(project='function-marketplace')\n", - "\n", - "# Importing the function from the hub\n", - "fn = mlrun.import_function(\"hub://rnn_serving\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "# Manually specifying needed packages \n", - "fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']\n", - "\n", - "# Adding the model \n", - "fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded\n", - "> 2021-10-17 10:43:54,257 [info] Initializing endpoint records\n", - "> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']\n" - ] - } - ], - "source": [ - "# When mocking, class has to be present\n", - "from rnn_serving import *\n", - "\n", - "# Mocking function\n", - "server = fn.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Getting the data\n", - "import cloudpickle as cp\n", - "from urllib.request import urlopen\n", - "\n", - "rnn_data = cp.load(urlopen(data_path))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "model used in this example take inputs with the shape `(None, None, 11)`.
\n", - "whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
\n", - "and the last dimenstion is the number of features the dataset have.
\n", - "our testing dataset has `(1,10,11)` means one instance to predict, with sequence length of 10, each step has 11 features." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import requests\n", - "\n", - "# KFServing protocol event\n", - "event_data = {\"inputs\": rnn_data}\n", - "\n", - "response = server.test(path='/v2/models/rnn_model/predict',body=event_data)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-17 10:43:57,192 [info] Starting remote function deploy\n", - "2021-10-17 10:43:57 (info) Deploying function\n", - "2021-10-17 10:43:57 (info) Building\n", - "2021-10-17 10:43:57 (info) Staging files and preparing base images\n", - "2021-10-17 10:43:57 (info) Building processor image\n", - "2021-10-17 10:43:58 (info) Build complete\n", - "2021-10-17 10:44:10 (info) Function deploy complete\n", - "> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}\n" - ] - } - ], - "source": [ - "address = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',\n", - " 'model_name': 'rnn_model',\n", - " 'outputs': '[[0.43563252687454224]]'}" - ] - }, - "execution_count": 10, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import requests\n", - "\n", - "# using requests to predict\n", - "response = requests.put(address+\"/v2/models/rnn_model/predict\", json = json.dumps(event_data))\n", - "json.loads(response.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#RNN-Serving)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/rnn_serving/latest/src/rnn_serving.py b/functions/development/rnn_serving/latest/src/rnn_serving.py deleted file mode 100644 index d7e783d7..00000000 --- a/functions/development/rnn_serving/latest/src/rnn_serving.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import mlrun -import numpy as np -from tensorflow import keras -import json - - -class RNN_Model_Serving(mlrun.serving.V2ModelServer): - def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file) - - def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/src/test_rnn_serving.py b/functions/development/rnn_serving/latest/src/test_rnn_serving.py deleted file mode 100644 index fb2f4997..00000000 --- a/functions/development/rnn_serving/latest/src/test_rnn_serving.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -import wget -from mlrun import import_function -from os import path -from rnn_serving import * - -DATASET = np.array([[6.9955170e-01, 6.9952875e-01, 2.7922913e-02, 2.7853036e-02, - 6.9955170e-01, 7.0086759e-01, 7.0118028e-01, 7.0142627e-01, - 2.7922913e-02, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 6.9998503e-01, 1.6527303e-03, 2.7853036e-02, - 7.0000792e-01, 7.0085293e-01, 7.0118028e-01, 7.0203447e-01, - 1.6527303e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0025057e-01, 1.6904050e-04, 2.7853036e-02, - 7.0027345e-01, 7.0014298e-01, 7.0190376e-01, 7.0128226e-01, - 1.6904050e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0144778e-01, 1.6904050e-04, 2.7853036e-02, - 7.0147055e-01, 7.0178574e-01, 7.0236105e-01, 7.0295709e-01, - 7.3906886e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0326620e-01, 7.0308524e-01, 7.0490342e-01, 7.0427048e-01, - 2.4815742e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0191067e-01, 7.0173001e-01, 7.0354480e-01, 7.0291305e-01, - 2.9976186e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0166123e-01, 7.0148063e-01, 7.0284635e-01, 7.0249581e-01, - 2.7904075e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0133996e-01, 7.0143080e-01, 7.0297277e-01, 7.0250750e-01, - 4.1491759e-04, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0150572e-01, 7.0251614e-01, 7.0281982e-01, 7.0370042e-01, - 2.1256472e-03, 0.0000000e+00, 0.0000000e+00], - [6.9955170e-01, 7.0324355e-01, 1.6904050e-04, 2.7853036e-02, - 7.0272487e-01, 7.0258951e-01, 7.0429617e-01, 7.0376801e-01, - 1.4207334e-03, 0.0000000e+00, 0.0000000e+00]]).reshape(1, 10, 11).tolist() - - -def download_pretrained_model(model_path): - # Run this to download the pre-trained model to your 
`models` directory - model_location = 'https://s3.wasabisys.com/iguazio/models/rnn/rnn_model.h5' - saved_models_directory = model_path - # Create paths - os.makedirs(saved_models_directory, exist_ok=1) - model_filepath = os.path.join(saved_models_directory, os.path.basename(model_location)) - wget.download(model_location, model_filepath) - - -def test_rnn_serving(): - model_path = os.path.join(os.path.abspath('./'), 'models') - model = model_path + '/rnn_model.h5' - if not path.exists(model): - download_pretrained_model(model_path) - - fn = import_function('function.yaml') - fn.add_model('rnn_model', model_path=model, class_name='RNN_Model_Serving') - # create an emulator (mock server) from the function configuration) - server = fn.to_mock_server() - resp = server.test("/v2/models/rnn_model/infer", {"inputs": DATASET}) - assert (resp['outputs'] == '[[0.453309565782547]]') diff --git a/functions/development/rnn_serving/latest/static/documentation.html b/functions/development/rnn_serving/latest/static/documentation.html deleted file mode 100644 index eb387b20..00000000 --- a/functions/development/rnn_serving/latest/static/documentation.html +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - -rnn_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

rnn_serving package

- -
- -
-
-
-
-
-

rnn_serving package#

-
-

Submodules#

-
-
-

rnn_serving.rnn_serving module#

-
-
-class rnn_serving.rnn_serving.RNN_Model_Serving(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]#
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]#
-

load and initialize the model and/or other elements

-
-
-
-predict(body)[source]#
-

model prediction operation

-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/static/example.html b/functions/development/rnn_serving/latest/static/example.html deleted file mode 100644 index 6855b7c1..00000000 --- a/functions/development/rnn_serving/latest/static/example.html +++ /dev/null @@ -1,433 +0,0 @@ - - - - - - - -RNN Serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

RNN Serving#

-

The following section we create a new model serving function which wraps our class , and specify model and other resources.
-Deploying the serving function will provide us an http endpoint that can handle requests in real time.
-This function is part of the stock-analysis demo.
-To see how the model is trained or how the data-set is generated, check out code folder in the demo repository.

-
-

Steps#

-
    -
  1. Setup function parameters

  2. -
  3. Importing the function

  4. -
  5. Testing the function locally

  6. -
  7. Testing the function remotely

  8. -
-
-
-
import warnings
-warnings.filterwarnings("ignore")
-
-
-
-
-
-
-
# Following packages are required, make sure to install
-# !pip install pip install torch==1.6.0
-# !pip install tensorflow
-# !pip install keras
-
-
-
-
-
-
-

Setup function parameters#

-
-
-
# Setting up models path
-rnn_model_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/rnn_serving/rnn_model.h5'
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/rnn_serving/stocks_data.pkl'
-
-
-
-
-
-
-

Importing the function#

-
-
-
import mlrun
-mlrun.set_environment(project='function-marketplace')
-
-# Importing the function from the hub
-fn = mlrun.import_function("hub://rnn_serving")
-fn.apply(mlrun.auto_mount())
-
-# Manually specifying needed packages 
-fn.spec.build.commands = ['pip install torch==1.6.0', 'pip install tensorflow', 'pip install keras']
-
-# Adding the model 
-fn.add_model(key='rnn_model', model_path=rnn_model_path ,class_name='RNN_Model_Serving')
-
-
-
-
-
> 2021-10-17 10:43:46,363 [info] loaded project function-marketplace from MLRun DB
-
-
-
<mlrun.serving.states.TaskStep at 0x7fb59c8fa2d0>
-
-
-
-
-
-
-

Testing the function locally#

-
-
-
# When mocking, class has to be present
-from rnn_serving import *
-
-# Mocking function
-server = fn.to_mock_server()
-
-
-
-
-
> 2021-10-17 10:43:54,256 [info] model rnn_model was loaded
-> 2021-10-17 10:43:54,257 [info] Initializing endpoint records
-> 2021-10-17 10:43:54,276 [info] Loaded ['rnn_model']
-
-
-
-
-
-
-
# Getting the data
-import cloudpickle as cp
-from urllib.request import urlopen
-
-rnn_data = cp.load(urlopen(data_path))
-
-
-
-
-

model used in this example take inputs with the shape (None, None, 11).
-whereas the first dimenstion is the number of instances, the second dimenstion is the number of timestamps
-and the last dimenstion is the number of features the dataset have.
-our testing dataset has (1,10,11) means one instance to predict, with sequence length of 10, each step has 11 features.

-
-
-
import requests
-
-# KFServing protocol event
-event_data = {"inputs": rnn_data}
-
-response = server.test(path='/v2/models/rnn_model/predict',body=event_data)
-response
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-
-
-

Testing the function remotely#

-
-
-
address = fn.deploy()
-
-
-
-
-
> 2021-10-17 10:43:57,192 [info] Starting remote function deploy
-2021-10-17 10:43:57  (info) Deploying function
-2021-10-17 10:43:57  (info) Building
-2021-10-17 10:43:57  (info) Staging files and preparing base images
-2021-10-17 10:43:57  (info) Building processor image
-2021-10-17 10:43:58  (info) Build complete
-2021-10-17 10:44:10  (info) Function deploy complete
-> 2021-10-17 10:44:11,677 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-rnn-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:30255']}
-
-
-
-
-
-
-
import json
-import requests
-
-# using requests to predict
-response = requests.put(address+"/v2/models/rnn_model/predict", json = json.dumps(event_data))
-json.loads(response.text)
-
-
-
-
-
{'id': '1bf6a3dc4d204e6e8bfd5834f5d691f1',
- 'model_name': 'rnn_model',
- 'outputs': '[[0.43563252687454224]]'}
-
-
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/static/function.html b/functions/development/rnn_serving/latest/static/function.html deleted file mode 100644 index 8a7e0549..00000000 --- a/functions/development/rnn_serving/latest/static/function.html +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: rnn-serving
-  tag: ''
-  hash: 548cd27edfdc49aed0b069d94bd049435d484722
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: deploy an rnn based stock analysis model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: rnn-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/rnn_serving/rnn_serving.py
-    spec:
-      runtime: python:3.6
-      handler: rnn_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: aW1wb3J0IG1scnVuCmltcG9ydCBudW1weSBhcyBucApmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmltcG9ydCBqc29uCgoKY2xhc3MgUk5OX01vZGVsX1NlcnZpbmcobWxydW4uc2VydmluZy5WMk1vZGVsU2VydmVyKToKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbChzdWZmaXg9Ii5oNSIpCiAgICAgICAgc2VsZi5tb2RlbCA9IGtlcmFzLm1vZGVscy5sb2FkX21vZGVsKG1vZGVsX2ZpbGUpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgICAgICByZXN1bHQgPSBzZWxmLm1vZGVsLnByZWRpY3QoZmVhdHMpCiAgICAgICAgICAgIHJlc3VsdCA9IGpzb24uZHVtcHMocmVzdWx0LnRvbGlzdCgpKQogICAgICAgICAgICByZXR1cm4gcmVzdWx0CiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oIkZhaWxlZCB0byBwcmVkaWN0ICVzIiAlIGUpCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#97b63199864dd95681bca5af86835d177bf9d67b:/User/test/functions/rnn_serving/rnn_serving.py
-    origin_filename: /User/test/functions/rnn_serving/rnn_serving.py
-  secret_sources: []
-  mount_applied: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/static/item.html b/functions/development/rnn_serving/latest/static/item.html deleted file mode 100644 index e5ad52f5..00000000 --- a/functions/development/rnn_serving/latest/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an rnn based stock analysis model server.
-doc: ''
-example: rnn_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: rnn-serving
-platformVersion: 3.5.0
-spec:
-  filename: rnn_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: serving
-  requirements: null
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/static/rnn_serving.html b/functions/development/rnn_serving/latest/static/rnn_serving.html deleted file mode 100644 index 449e55e7..00000000 --- a/functions/development/rnn_serving/latest/static/rnn_serving.html +++ /dev/null @@ -1,175 +0,0 @@ - - - - - - - -rnn_serving.rnn_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for rnn_serving.rnn_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-
[docs]class RNN_Model_Serving(mlrun.serving.V2ModelServer): -
[docs] def load(self): - """load and initialize the model and/or other elements""" - model_file, extra_data = self.get_model(suffix=".h5") - self.model = keras.models.load_model(model_file)
- -
[docs] def predict(self, body): - try: - """Generate model predictions from sample.""" - feats = np.asarray(body['inputs']) - result = self.model.predict(feats) - result = json.dumps(result.tolist()) - return result - except Exception as e: - raise Exception("Failed to predict %s" % e)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/rnn_serving/latest/static/source.html b/functions/development/rnn_serving/latest/static/source.html deleted file mode 100644 index d7b2b50d..00000000 --- a/functions/development/rnn_serving/latest/static/source.html +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import mlrun
-import numpy as np
-from tensorflow import keras
-import json
-
-
-class RNN_Model_Serving(mlrun.serving.V2ModelServer):
-    def load(self):
-        """load and initialize the model and/or other elements"""
-        model_file, extra_data = self.get_model(suffix=".h5")
-        self.model = keras.models.load_model(model_file)
-
-    def predict(self, body):
-        try:
-            """Generate model predictions from sample."""
-            feats = np.asarray(body['inputs'])
-            result = self.model.predict(feats)
-            result = json.dumps(result.tolist())
-            return result
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.0.1/src/README.md b/functions/development/slack_notify/0.0.1/src/README.md deleted file mode 100644 index 9bde3299..00000000 --- a/functions/development/slack_notify/0.0.1/src/README.md +++ /dev/null @@ -1 +0,0 @@ -# Send Notification to Slack \ No newline at end of file diff --git a/functions/development/slack_notify/0.0.1/src/function.yaml b/functions/development/slack_notify/0.0.1/src/function.yaml deleted file mode 100644 index 1ede8b03..00000000 --- a/functions/development/slack_notify/0.0.1/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: job -metadata: - name: slack-notify - tag: '' - hash: 3de7e78ed9b7928af192badf988055086431fb58 - project: default - labels: - author: mdl - categories: - - utils -spec: - command: '' - args: [] - image: python:3.6-jessie - env: [] - default_handler: slack_notify - entry_points: - slack_notify: - name: slack_notify - doc: Summarize a table - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: webhook_url - type: str - doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks' - default: URL - - name: slack_blocks - type: List[str] - doc: Message blocks list. 
NOT IMPLEMENTED YET - default: [] - - name: notification_text - type: str - doc: Notification text - default: Notification - outputs: - - default: '' - lineno: 14 - description: Send Slack notification - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo= - commands: - - python -m pip install requests - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py - affinity: null -verbose: false diff --git a/functions/development/slack_notify/0.0.1/src/item.yaml b/functions/development/slack_notify/0.0.1/src/item.yaml deleted file mode 100644 index 
4d9d075d..00000000 --- a/functions/development/slack_notify/0.0.1/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- utils -description: Send Slack notification -doc: '' -example: slack_notify.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: mdl -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: slack-notify -platformVersion: '' -spec: - filename: slack_notify.py - handler: slack_notify - image: python:3.6-jessie - kind: job - requirements: - - requests -url: '' -version: 0.0.1 diff --git a/functions/development/slack_notify/0.0.1/src/slack_notify.ipynb b/functions/development/slack_notify/0.0.1/src/slack_notify.ipynb deleted file mode 100644 index 8119bb8c..00000000 --- a/functions/development/slack_notify/0.0.1/src/slack_notify.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'python:3.6-jessie'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"python:3.6-jessie\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c \n", - "pip install requests" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import requests\n", - "from mlrun.execution import MLClientCtx\n", - "from typing import List" - ] - }, - { - "cell_type": 
"code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "def slack_notify(\n", - " context: MLClientCtx,\n", - " webhook_url: str = \"URL\",\n", - " slack_blocks: List[str] = [],\n", - " notification_text: str = \"Notification\"\n", - ") -> None:\n", - " \"\"\"Summarize a table\n", - " :param context: the function context\n", - " :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - " :param notification_text: Notification text\n", - " :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET\n", - " \"\"\"\n", - " \n", - " data = {\n", - " 'text': notification_text\n", - " }\n", - " print(\"====\",webhook_url)\n", - " response = requests.post(webhook_url, data=json.dumps(\n", - " data), headers={'Content-Type': 'application/json'})\n", - "\n", - " print('Response: ' + str(response.text))\n", - " print('Response code: ' + str(response.status_code))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"slack_notify\")\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"slack_notify\"\n", - "fn.spec.description = \"Send Slack notification\"\n", - 
"fn.metadata.categories = [\"ops\"]\n", - "fn.metadata.labels = {\"author\": \"mdl\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "func = import_function(\"hub://slack_notify\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "\n", - "#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - "task_params = {\n", - " \"webhook_url\" : \"https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx\",\n", - " \"notification_text\" : \"Test Notification\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(\n", - " name=\"tasks slack notify\", \n", - " params = task_params,\n", - " handler=slack_notify)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run local where artifact path is fixed " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = run_local(task, artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remote where artifact path includes the run id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.run(task, params=task_params, workdir=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "function: slack-notify\n", - "Send Slack notification\n", - "default handler: slack_notify\n", - "entry points:\n", - " slack_notify: Summarize a table\n", - " context(MLClientCtx) - the function context\n", - " webhook_url(str) - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL\n", - " slack_blocks(List[str]) - Message blocks list. NOT IMPLEMENTED YET\n", - " notification_text(str) - Notification text, default=Notification\n" - ] - } - ], - "source": [ - "func.doc()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/slack_notify/0.0.1/src/slack_notify.py b/functions/development/slack_notify/0.0.1/src/slack_notify.py deleted file mode 100644 index 91624be5..00000000 --- a/functions/development/slack_notify/0.0.1/src/slack_notify.py +++ /dev/null @@ -1,34 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import os -import json -import requests -from mlrun.execution import MLClientCtx -from typing import List - - -def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. 
Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code)) diff --git a/functions/development/slack_notify/0.0.1/static/documentation.html b/functions/development/slack_notify/0.0.1/static/documentation.html deleted file mode 100644 index a5047e6f..00000000 --- a/functions/development/slack_notify/0.0.1/static/documentation.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - -slack_notify package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

slack_notify package

-
-

Submodules

-
-
-

slack_notify.slack_notify module

-
-
-slack_notify.slack_notify.slack_notify(context: mlrun.execution.MLClientCtx, webhook_url: str = 'URL', slack_blocks: List[str] = [], notification_text: str = 'Notification')None[source]
-

Summarize a table -:param context: the function context -:param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks -:param notification_text: Notification text -:param slack_blocks: Message blocks list. NOT IMPLEMENTED YET

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/slack_notify/0.0.1/static/example.html b/functions/development/slack_notify/0.0.1/static/example.html deleted file mode 100644 index 8844f454..00000000 --- a/functions/development/slack_notify/0.0.1/static/example.html +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -mlconfig - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- -
-
-
-
-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "python:3.6-jessie"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'python:3.6-jessie'
-
-
-
-
-
-
-
%%nuclio cmd -c 
-pip install requests
-
-
-
-
-
-
-
import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-
-
-
-
-
-
import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
-
-
-
-
def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks:  List[str] = [],
-    notification_text: str = "Notification"
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-    
-    data = {
-        'text': notification_text
-    }
-    print("====",webhook_url)
-    response = requests.post(webhook_url, data=json.dumps(
-        data), headers={'Content-Type': 'application/json'})
-
-    print('Response: ' + str(response.text))
-    print('Response code: ' + str(response.status_code))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("slack_notify")
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "slack_notify"
-fn.spec.description = "Send Slack notification"
-fn.metadata.categories = ["ops"]
-fn.metadata.labels = {"author": "mdl"}
-fn.export("function.yaml")
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-func = import_function("hub://slack_notify")
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-
-#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-task_params = {
-    "webhook_url" : "https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx",
-    "notification_text" : "Test Notification"
-}
-
-
-
-
-
-
-
task = NewTask(
-    name="tasks slack notify", 
-    params = task_params,
-    handler=slack_notify)
-
-
-
-
-
-

run local where artifact path is fixed

-
-
-
run = run_local(task, artifact_path=mlconf.artifact_path)
-
-
-
-
-
-
-

run remote where artifact path includes the run id

-
-
-
func.deploy()
-
-
-
-
-
-
-
func.run(task, params=task_params,  workdir=mlconf.artifact_path)
-
-
-
-
-
-
-
func.doc()
-
-
-
-
-
function: slack-notify
-Send Slack notification
-default handler: slack_notify
-entry points:
-  slack_notify: Summarize a table
-    context(MLClientCtx)  - the function context
-    webhook_url(str)  - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL
-    slack_blocks(List[str])  - Message blocks list. NOT IMPLEMENTED YET
-    notification_text(str)  - Notification text, default=Notification
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/slack_notify/0.0.1/static/function.html b/functions/development/slack_notify/0.0.1/static/function.html deleted file mode 100644 index 84e586ae..00000000 --- a/functions/development/slack_notify/0.0.1/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: slack-notify
-  tag: ''
-  hash: 3de7e78ed9b7928af192badf988055086431fb58
-  project: default
-  labels:
-    author: mdl
-  categories:
-  - utils
-spec:
-  command: ''
-  args: []
-  image: python:3.6-jessie
-  env: []
-  default_handler: slack_notify
-  entry_points:
-    slack_notify:
-      name: slack_notify
-      doc: Summarize a table
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: webhook_url
-        type: str
-        doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks'
-        default: URL
-      - name: slack_blocks
-        type: List[str]
-        doc: Message blocks list. NOT IMPLEMENTED YET
-        default: []
-      - name: notification_text
-        type: str
-        doc: Notification text
-        default: Notification
-      outputs:
-      - default: ''
-      lineno: 14
-  description: Send Slack notification
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo=
-    commands:
-    - python -m pip install requests
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.0.1/static/item.html b/functions/development/slack_notify/0.0.1/static/item.html deleted file mode 100644 index 00a0735d..00000000 --- a/functions/development/slack_notify/0.0.1/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- utils
-description: Send Slack notification
-doc: ''
-example: slack_notify.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: mdl
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: slack-notify
-platformVersion: ''
-spec:
-  filename: slack_notify.py
-  handler: slack_notify
-  image: python:3.6-jessie
-  kind: job
-  requirements:
-  - requests
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.0.1/static/source.html b/functions/development/slack_notify/0.0.1/static/source.html deleted file mode 100644 index 5daa188e..00000000 --- a/functions/development/slack_notify/0.0.1/static/source.html +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks: List[str] = [],
-    notification_text: str = "Notification",
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-
-    data = {"text": notification_text}
-    print("====", webhook_url)
-    response = requests.post(
-        webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"}
-    )
-
-    print("Response: " + str(response.text))
-    print("Response code: " + str(response.status_code))
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.8.0/src/README.md b/functions/development/slack_notify/0.8.0/src/README.md deleted file mode 100644 index 9bde3299..00000000 --- a/functions/development/slack_notify/0.8.0/src/README.md +++ /dev/null @@ -1 +0,0 @@ -# Send Notification to Slack \ No newline at end of file diff --git a/functions/development/slack_notify/0.8.0/src/function.yaml b/functions/development/slack_notify/0.8.0/src/function.yaml deleted file mode 100644 index 1ede8b03..00000000 --- a/functions/development/slack_notify/0.8.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: job -metadata: - name: slack-notify - tag: '' - hash: 3de7e78ed9b7928af192badf988055086431fb58 - project: default - labels: - author: mdl - categories: - - utils -spec: - command: '' - args: [] - image: python:3.6-jessie - env: [] - default_handler: slack_notify - entry_points: - slack_notify: - name: slack_notify - doc: Summarize a table - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: webhook_url - type: str - doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks' - default: URL - - name: slack_blocks - type: List[str] - doc: Message blocks list. 
NOT IMPLEMENTED YET - default: [] - - name: notification_text - type: str - doc: Notification text - default: Notification - outputs: - - default: '' - lineno: 14 - description: Send Slack notification - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo= - commands: - - python -m pip install requests - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py - affinity: null -verbose: false diff --git a/functions/development/slack_notify/0.8.0/src/item.yaml b/functions/development/slack_notify/0.8.0/src/item.yaml deleted file mode 100644 index 
6bbd8b7b..00000000 --- a/functions/development/slack_notify/0.8.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- utils -description: Send Slack notification -doc: '' -example: slack_notify.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: mdl -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: slack-notify -platformVersion: 3.2.0 -spec: - filename: slack_notify.py - handler: slack_notify - image: python:3.6-jessie - kind: job - requirements: - - requests -url: '' -version: 0.8.0 diff --git a/functions/development/slack_notify/0.8.0/src/slack_notify.ipynb b/functions/development/slack_notify/0.8.0/src/slack_notify.ipynb deleted file mode 100644 index 8119bb8c..00000000 --- a/functions/development/slack_notify/0.8.0/src/slack_notify.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'python:3.6-jessie'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"python:3.6-jessie\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c \n", - "pip install requests" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import requests\n", - "from mlrun.execution import MLClientCtx\n", - "from typing import List" - ] - }, - { - 
"cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "def slack_notify(\n", - " context: MLClientCtx,\n", - " webhook_url: str = \"URL\",\n", - " slack_blocks: List[str] = [],\n", - " notification_text: str = \"Notification\"\n", - ") -> None:\n", - " \"\"\"Summarize a table\n", - " :param context: the function context\n", - " :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - " :param notification_text: Notification text\n", - " :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET\n", - " \"\"\"\n", - " \n", - " data = {\n", - " 'text': notification_text\n", - " }\n", - " print(\"====\",webhook_url)\n", - " response = requests.post(webhook_url, data=json.dumps(\n", - " data), headers={'Content-Type': 'application/json'})\n", - "\n", - " print('Response: ' + str(response.text))\n", - " print('Response code: ' + str(response.status_code))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"slack_notify\")\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"slack_notify\"\n", - "fn.spec.description = \"Send Slack notification\"\n", - 
"fn.metadata.categories = [\"ops\"]\n", - "fn.metadata.labels = {\"author\": \"mdl\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "func = import_function(\"hub://slack_notify\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "\n", - "#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - "task_params = {\n", - " \"webhook_url\" : \"https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx\",\n", - " \"notification_text\" : \"Test Notification\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(\n", - " name=\"tasks slack notify\", \n", - " params = task_params,\n", - " handler=slack_notify)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run local where artifact path is fixed " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = run_local(task, artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remote where artifact path includes the run id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.run(task, params=task_params, workdir=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "function: slack-notify\n", - "Send Slack notification\n", - "default handler: slack_notify\n", - "entry points:\n", - " slack_notify: Summarize a table\n", - " context(MLClientCtx) - the function context\n", - " webhook_url(str) - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL\n", - " slack_blocks(List[str]) - Message blocks list. NOT IMPLEMENTED YET\n", - " notification_text(str) - Notification text, default=Notification\n" - ] - } - ], - "source": [ - "func.doc()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/slack_notify/0.8.0/src/slack_notify.py b/functions/development/slack_notify/0.8.0/src/slack_notify.py deleted file mode 100644 index 91624be5..00000000 --- a/functions/development/slack_notify/0.8.0/src/slack_notify.py +++ /dev/null @@ -1,34 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import os -import json -import requests -from mlrun.execution import MLClientCtx -from typing import List - - -def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. 
Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code)) diff --git a/functions/development/slack_notify/0.8.0/static/documentation.html b/functions/development/slack_notify/0.8.0/static/documentation.html deleted file mode 100644 index a5047e6f..00000000 --- a/functions/development/slack_notify/0.8.0/static/documentation.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - -slack_notify package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

slack_notify package

-
-

Submodules

-
-
-

slack_notify.slack_notify module

-
-
-slack_notify.slack_notify.slack_notify(context: mlrun.execution.MLClientCtx, webhook_url: str = 'URL', slack_blocks: List[str] = [], notification_text: str = 'Notification')None[source]
-

Summarize a table -:param context: the function context -:param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks -:param notification_text: Notification text -:param slack_blocks: Message blocks list. NOT IMPLEMENTED YET

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/slack_notify/0.8.0/static/example.html b/functions/development/slack_notify/0.8.0/static/example.html deleted file mode 100644 index 8844f454..00000000 --- a/functions/development/slack_notify/0.8.0/static/example.html +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -mlconfig - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- -
-
-
-
-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "python:3.6-jessie"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'python:3.6-jessie'
-
-
-
-
-
-
-
%%nuclio cmd -c 
-pip install requests
-
-
-
-
-
-
-
import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-
-
-
-
-
-
import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
-
-
-
-
def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks:  List[str] = [],
-    notification_text: str = "Notification"
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-    
-    data = {
-        'text': notification_text
-    }
-    print("====",webhook_url)
-    response = requests.post(webhook_url, data=json.dumps(
-        data), headers={'Content-Type': 'application/json'})
-
-    print('Response: ' + str(response.text))
-    print('Response code: ' + str(response.status_code))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("slack_notify")
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "slack_notify"
-fn.spec.description = "Send Slack notification"
-fn.metadata.categories = ["ops"]
-fn.metadata.labels = {"author": "mdl"}
-fn.export("function.yaml")
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-func = import_function("hub://slack_notify")
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-
-#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-task_params = {
-    "webhook_url" : "https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx",
-    "notification_text" : "Test Notification"
-}
-
-
-
-
-
-
-
task = NewTask(
-    name="tasks slack notify", 
-    params = task_params,
-    handler=slack_notify)
-
-
-
-
-
-

run local where artifact path is fixed

-
-
-
run = run_local(task, artifact_path=mlconf.artifact_path)
-
-
-
-
-
-
-

run remote where artifact path includes the run id

-
-
-
func.deploy()
-
-
-
-
-
-
-
func.run(task, params=task_params,  workdir=mlconf.artifact_path)
-
-
-
-
-
-
-
func.doc()
-
-
-
-
-
function: slack-notify
-Send Slack notification
-default handler: slack_notify
-entry points:
-  slack_notify: Summarize a table
-    context(MLClientCtx)  - the function context
-    webhook_url(str)  - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL
-    slack_blocks(List[str])  - Message blocks list. NOT IMPLEMENTED YET
-    notification_text(str)  - Notification text, default=Notification
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/slack_notify/0.8.0/static/function.html b/functions/development/slack_notify/0.8.0/static/function.html deleted file mode 100644 index 84e586ae..00000000 --- a/functions/development/slack_notify/0.8.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: slack-notify
-  tag: ''
-  hash: 3de7e78ed9b7928af192badf988055086431fb58
-  project: default
-  labels:
-    author: mdl
-  categories:
-  - utils
-spec:
-  command: ''
-  args: []
-  image: python:3.6-jessie
-  env: []
-  default_handler: slack_notify
-  entry_points:
-    slack_notify:
-      name: slack_notify
-      doc: Summarize a table
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: webhook_url
-        type: str
-        doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks'
-        default: URL
-      - name: slack_blocks
-        type: List[str]
-        doc: Message blocks list. NOT IMPLEMENTED YET
-        default: []
-      - name: notification_text
-        type: str
-        doc: Notification text
-        default: Notification
-      outputs:
-      - default: ''
-      lineno: 14
-  description: Send Slack notification
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo=
-    commands:
-    - python -m pip install requests
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.8.0/static/item.html b/functions/development/slack_notify/0.8.0/static/item.html deleted file mode 100644 index ff3bf917..00000000 --- a/functions/development/slack_notify/0.8.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- utils
-description: Send Slack notification
-doc: ''
-example: slack_notify.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: mdl
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: slack-notify
-platformVersion: 3.2.0
-spec:
-  filename: slack_notify.py
-  handler: slack_notify
-  image: python:3.6-jessie
-  kind: job
-  requirements:
-  - requests
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.8.0/static/source.html b/functions/development/slack_notify/0.8.0/static/source.html deleted file mode 100644 index 5daa188e..00000000 --- a/functions/development/slack_notify/0.8.0/static/source.html +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks: List[str] = [],
-    notification_text: str = "Notification",
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-
-    data = {"text": notification_text}
-    print("====", webhook_url)
-    response = requests.post(
-        webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"}
-    )
-
-    print("Response: " + str(response.text))
-    print("Response code: " + str(response.status_code))
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.9.0/src/README.md b/functions/development/slack_notify/0.9.0/src/README.md deleted file mode 100644 index 9bde3299..00000000 --- a/functions/development/slack_notify/0.9.0/src/README.md +++ /dev/null @@ -1 +0,0 @@ -# Send Notification to Slack \ No newline at end of file diff --git a/functions/development/slack_notify/0.9.0/src/function.yaml b/functions/development/slack_notify/0.9.0/src/function.yaml deleted file mode 100644 index 95af087c..00000000 --- a/functions/development/slack_notify/0.9.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: job -metadata: - name: slack-notify - tag: '' - hash: 3de7e78ed9b7928af192badf988055086431fb58 - project: '' - labels: - author: mdl - categories: - - utils -spec: - command: '' - args: [] - image: python:3.6-jessie - env: [] - default_handler: slack_notify - entry_points: - slack_notify: - name: slack_notify - doc: Summarize a table - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: webhook_url - type: str - doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks' - default: URL - - name: slack_blocks - type: List[str] - doc: Message blocks list. 
NOT IMPLEMENTED YET - default: [] - - name: notification_text - type: str - doc: Notification text - default: Notification - outputs: - - default: '' - lineno: 14 - description: Send Slack notification - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo= - commands: - - python -m pip install requests - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py - affinity: null -verbose: false diff --git a/functions/development/slack_notify/0.9.0/src/item.yaml b/functions/development/slack_notify/0.9.0/src/item.yaml deleted file mode 100644 index 
0cdb6a9a..00000000 --- a/functions/development/slack_notify/0.9.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- utils -description: Send Slack notification -doc: '' -example: slack_notify.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: mdl -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: slack-notify -platformVersion: 3.2.0 -spec: - filename: slack_notify.py - handler: slack_notify - image: python:3.6-jessie - kind: job - requirements: - - requests -url: '' -version: 0.9.0 diff --git a/functions/development/slack_notify/0.9.0/src/slack_notify.ipynb b/functions/development/slack_notify/0.9.0/src/slack_notify.ipynb deleted file mode 100644 index 8119bb8c..00000000 --- a/functions/development/slack_notify/0.9.0/src/slack_notify.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'python:3.6-jessie'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"python:3.6-jessie\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c \n", - "pip install requests" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import requests\n", - "from mlrun.execution import MLClientCtx\n", - "from typing import List" - ] - }, - { - 
"cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "def slack_notify(\n", - " context: MLClientCtx,\n", - " webhook_url: str = \"URL\",\n", - " slack_blocks: List[str] = [],\n", - " notification_text: str = \"Notification\"\n", - ") -> None:\n", - " \"\"\"Summarize a table\n", - " :param context: the function context\n", - " :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - " :param notification_text: Notification text\n", - " :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET\n", - " \"\"\"\n", - " \n", - " data = {\n", - " 'text': notification_text\n", - " }\n", - " print(\"====\",webhook_url)\n", - " response = requests.post(webhook_url, data=json.dumps(\n", - " data), headers={'Content-Type': 'application/json'})\n", - "\n", - " print('Response: ' + str(response.text))\n", - " print('Response code: ' + str(response.status_code))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"slack_notify\")\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"slack_notify\"\n", - "fn.spec.description = \"Send Slack notification\"\n", - 
"fn.metadata.categories = [\"ops\"]\n", - "fn.metadata.labels = {\"author\": \"mdl\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "func = import_function(\"hub://slack_notify\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "\n", - "#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - "task_params = {\n", - " \"webhook_url\" : \"https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx\",\n", - " \"notification_text\" : \"Test Notification\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(\n", - " name=\"tasks slack notify\", \n", - " params = task_params,\n", - " handler=slack_notify)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run local where artifact path is fixed " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = run_local(task, artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remote where artifact path includes the run id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.run(task, params=task_params, workdir=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "function: slack-notify\n", - "Send Slack notification\n", - "default handler: slack_notify\n", - "entry points:\n", - " slack_notify: Summarize a table\n", - " context(MLClientCtx) - the function context\n", - " webhook_url(str) - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL\n", - " slack_blocks(List[str]) - Message blocks list. NOT IMPLEMENTED YET\n", - " notification_text(str) - Notification text, default=Notification\n" - ] - } - ], - "source": [ - "func.doc()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/slack_notify/0.9.0/src/slack_notify.py b/functions/development/slack_notify/0.9.0/src/slack_notify.py deleted file mode 100644 index 91624be5..00000000 --- a/functions/development/slack_notify/0.9.0/src/slack_notify.py +++ /dev/null @@ -1,34 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import os -import json -import requests -from mlrun.execution import MLClientCtx -from typing import List - - -def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. 
Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code)) diff --git a/functions/development/slack_notify/0.9.0/static/documentation.html b/functions/development/slack_notify/0.9.0/static/documentation.html deleted file mode 100644 index a5047e6f..00000000 --- a/functions/development/slack_notify/0.9.0/static/documentation.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - -slack_notify package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

slack_notify package

-
-

Submodules

-
-
-

slack_notify.slack_notify module

-
-
-slack_notify.slack_notify.slack_notify(context: mlrun.execution.MLClientCtx, webhook_url: str = 'URL', slack_blocks: List[str] = [], notification_text: str = 'Notification')None[source]
-

Summarize a table -:param context: the function context -:param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks -:param notification_text: Notification text -:param slack_blocks: Message blocks list. NOT IMPLEMENTED YET

-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/slack_notify/0.9.0/static/example.html b/functions/development/slack_notify/0.9.0/static/example.html deleted file mode 100644 index 8844f454..00000000 --- a/functions/development/slack_notify/0.9.0/static/example.html +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -mlconfig - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- -
-
-
-
-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "python:3.6-jessie"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'python:3.6-jessie'
-
-
-
-
-
-
-
%%nuclio cmd -c 
-pip install requests
-
-
-
-
-
-
-
import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-
-
-
-
-
-
import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
-
-
-
-
def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks:  List[str] = [],
-    notification_text: str = "Notification"
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-    
-    data = {
-        'text': notification_text
-    }
-    print("====",webhook_url)
-    response = requests.post(webhook_url, data=json.dumps(
-        data), headers={'Content-Type': 'application/json'})
-
-    print('Response: ' + str(response.text))
-    print('Response code: ' + str(response.status_code))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("slack_notify")
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "slack_notify"
-fn.spec.description = "Send Slack notification"
-fn.metadata.categories = ["ops"]
-fn.metadata.labels = {"author": "mdl"}
-fn.export("function.yaml")
-
-
-
-
-
-
-

tests

-
-
-
from mlrun import import_function
-func = import_function("hub://slack_notify")
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-
-#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-task_params = {
-    "webhook_url" : "https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx",
-    "notification_text" : "Test Notification"
-}
-
-
-
-
-
-
-
task = NewTask(
-    name="tasks slack notify", 
-    params = task_params,
-    handler=slack_notify)
-
-
-
-
-
-

run local where artifact path is fixed

-
-
-
run = run_local(task, artifact_path=mlconf.artifact_path)
-
-
-
-
-
-
-

run remote where artifact path includes the run id

-
-
-
func.deploy()
-
-
-
-
-
-
-
func.run(task, params=task_params,  workdir=mlconf.artifact_path)
-
-
-
-
-
-
-
func.doc()
-
-
-
-
-
function: slack-notify
-Send Slack notification
-default handler: slack_notify
-entry points:
-  slack_notify: Summarize a table
-    context(MLClientCtx)  - the function context
-    webhook_url(str)  - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL
-    slack_blocks(List[str])  - Message blocks list. NOT IMPLEMENTED YET
-    notification_text(str)  - Notification text, default=Notification
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/slack_notify/0.9.0/static/function.html b/functions/development/slack_notify/0.9.0/static/function.html deleted file mode 100644 index 32f2424b..00000000 --- a/functions/development/slack_notify/0.9.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: slack-notify
-  tag: ''
-  hash: 3de7e78ed9b7928af192badf988055086431fb58
-  project: ''
-  labels:
-    author: mdl
-  categories:
-  - utils
-spec:
-  command: ''
-  args: []
-  image: python:3.6-jessie
-  env: []
-  default_handler: slack_notify
-  entry_points:
-    slack_notify:
-      name: slack_notify
-      doc: Summarize a table
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: webhook_url
-        type: str
-        doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks'
-        default: URL
-      - name: slack_blocks
-        type: List[str]
-        doc: Message blocks list. NOT IMPLEMENTED YET
-        default: []
-      - name: notification_text
-        type: str
-        doc: Notification text
-        default: Notification
-      outputs:
-      - default: ''
-      lineno: 14
-  description: Send Slack notification
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo=
-    commands:
-    - python -m pip install requests
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.9.0/static/item.html b/functions/development/slack_notify/0.9.0/static/item.html deleted file mode 100644 index f24d650a..00000000 --- a/functions/development/slack_notify/0.9.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- utils
-description: Send Slack notification
-doc: ''
-example: slack_notify.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: mdl
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: slack-notify
-platformVersion: 3.2.0
-spec:
-  filename: slack_notify.py
-  handler: slack_notify
-  image: python:3.6-jessie
-  kind: job
-  requirements:
-  - requests
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/0.9.0/static/source.html b/functions/development/slack_notify/0.9.0/static/source.html deleted file mode 100644 index 5daa188e..00000000 --- a/functions/development/slack_notify/0.9.0/static/source.html +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks: List[str] = [],
-    notification_text: str = "Notification",
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-
-    data = {"text": notification_text}
-    print("====", webhook_url)
-    response = requests.post(
-        webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"}
-    )
-
-    print("Response: " + str(response.text))
-    print("Response code: " + str(response.status_code))
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/src/README.md b/functions/development/slack_notify/1.1.0/src/README.md deleted file mode 100644 index 9bde3299..00000000 --- a/functions/development/slack_notify/1.1.0/src/README.md +++ /dev/null @@ -1 +0,0 @@ -# Send Notification to Slack \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/src/function.yaml b/functions/development/slack_notify/1.1.0/src/function.yaml deleted file mode 100644 index 95af087c..00000000 --- a/functions/development/slack_notify/1.1.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: job -metadata: - name: slack-notify - tag: '' - hash: 3de7e78ed9b7928af192badf988055086431fb58 - project: '' - labels: - author: mdl - categories: - - utils -spec: - command: '' - args: [] - image: python:3.6-jessie - env: [] - default_handler: slack_notify - entry_points: - slack_notify: - name: slack_notify - doc: Summarize a table - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: webhook_url - type: str - doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks' - default: URL - - name: slack_blocks - type: List[str] - doc: Message blocks list. 
NOT IMPLEMENTED YET - default: [] - - name: notification_text - type: str - doc: Notification text - default: Notification - outputs: - - default: '' - lineno: 14 - description: Send Slack notification - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo= - commands: - - python -m pip install requests - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py - affinity: null -verbose: false diff --git a/functions/development/slack_notify/1.1.0/src/item.yaml b/functions/development/slack_notify/1.1.0/src/item.yaml deleted file mode 100644 index 
6bdfd2c8..00000000 --- a/functions/development/slack_notify/1.1.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- utils -description: Send Slack notification -doc: '' -example: slack_notify.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: mdl -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: slack-notify -platformVersion: 3.5.0 -spec: - filename: slack_notify.py - handler: slack_notify - image: python:3.6-jessie - kind: job - requirements: - - requests -url: '' -version: 1.1.0 diff --git a/functions/development/slack_notify/1.1.0/src/slack_notify.ipynb b/functions/development/slack_notify/1.1.0/src/slack_notify.ipynb deleted file mode 100644 index 8119bb8c..00000000 --- a/functions/development/slack_notify/1.1.0/src/slack_notify.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'python:3.6-jessie'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"python:3.6-jessie\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c \n", - "pip install requests" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import requests\n", - "from mlrun.execution import MLClientCtx\n", - "from typing import List" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "def slack_notify(\n", - " context: MLClientCtx,\n", - " webhook_url: str = \"URL\",\n", - " slack_blocks: List[str] = [],\n", - " notification_text: str = \"Notification\"\n", - ") -> None:\n", - " \"\"\"Summarize a table\n", - " :param context: the function context\n", - " :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - " :param notification_text: Notification text\n", - " :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET\n", - " \"\"\"\n", - " \n", - " data = {\n", - " 'text': notification_text\n", - " }\n", - " print(\"====\",webhook_url)\n", - " response = requests.post(webhook_url, data=json.dumps(\n", - " data), headers={'Content-Type': 'application/json'})\n", - "\n", - " print('Response: ' + str(response.text))\n", - " print('Response code: ' + str(response.status_code))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"slack_notify\")\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"slack_notify\"\n", - "fn.spec.description = \"Send Slack notification\"\n", - 
"fn.metadata.categories = [\"ops\"]\n", - "fn.metadata.labels = {\"author\": \"mdl\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "func = import_function(\"hub://slack_notify\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "\n", - "#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - "task_params = {\n", - " \"webhook_url\" : \"https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx\",\n", - " \"notification_text\" : \"Test Notification\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(\n", - " name=\"tasks slack notify\", \n", - " params = task_params,\n", - " handler=slack_notify)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run local where artifact path is fixed " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = run_local(task, artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remote where artifact path includes the run id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.run(task, params=task_params, workdir=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "function: slack-notify\n", - "Send Slack notification\n", - "default handler: slack_notify\n", - "entry points:\n", - " slack_notify: Summarize a table\n", - " context(MLClientCtx) - the function context\n", - " webhook_url(str) - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL\n", - " slack_blocks(List[str]) - Message blocks list. NOT IMPLEMENTED YET\n", - " notification_text(str) - Notification text, default=Notification\n" - ] - } - ], - "source": [ - "func.doc()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/slack_notify/1.1.0/src/slack_notify.py b/functions/development/slack_notify/1.1.0/src/slack_notify.py deleted file mode 100644 index 3208ffee..00000000 --- a/functions/development/slack_notify/1.1.0/src/slack_notify.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import os -import json -import requests -from mlrun.execution import MLClientCtx -from typing import List - - -def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code)) diff --git a/functions/development/slack_notify/1.1.0/static/documentation.html b/functions/development/slack_notify/1.1.0/static/documentation.html deleted file mode 100644 index 8e88b8f8..00000000 --- a/functions/development/slack_notify/1.1.0/static/documentation.html +++ /dev/null @@ -1,231 +0,0 @@ - - - - - - - -slack_notify package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

slack_notify package

- -
- -
-
-
-
-
-

slack_notify package#

-
-

Submodules#

-
-
-

slack_notify.slack_notify module#

-
-
-slack_notify.slack_notify.slack_notify(context: mlrun.execution.MLClientCtx, webhook_url: str = 'URL', slack_blocks: List[str] = [], notification_text: str = 'Notification')None[source]#
-

Summarize a table -:param context: the function context -:param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks -:param notification_text: Notification text -:param slack_blocks: Message blocks list. NOT IMPLEMENTED YET

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/static/example.html b/functions/development/slack_notify/1.1.0/static/example.html deleted file mode 100644 index 12375287..00000000 --- a/functions/development/slack_notify/1.1.0/static/example.html +++ /dev/null @@ -1,443 +0,0 @@ - - - - - - - -mlconfig - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "python:3.6-jessie"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'python:3.6-jessie'
-
-
-
-
-
-
-
%%nuclio cmd -c 
-pip install requests
-
-
-
-
-
-
-
import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-
-
-
-
-
-
import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
-
-
-
-
def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks:  List[str] = [],
-    notification_text: str = "Notification"
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-    
-    data = {
-        'text': notification_text
-    }
-    print("====",webhook_url)
-    response = requests.post(webhook_url, data=json.dumps(
-        data), headers={'Content-Type': 'application/json'})
-
-    print('Response: ' + str(response.text))
-    print('Response code: ' + str(response.status_code))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig#

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save#

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("slack_notify")
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "slack_notify"
-fn.spec.description = "Send Slack notification"
-fn.metadata.categories = ["ops"]
-fn.metadata.labels = {"author": "mdl"}
-fn.export("function.yaml")
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun import import_function
-func = import_function("hub://slack_notify")
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-
-#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-task_params = {
-    "webhook_url" : "https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx",
-    "notification_text" : "Test Notification"
-}
-
-
-
-
-
-
-
task = NewTask(
-    name="tasks slack notify", 
-    params = task_params,
-    handler=slack_notify)
-
-
-
-
-
-

run local where artifact path is fixed#

-
-
-
run = run_local(task, artifact_path=mlconf.artifact_path)
-
-
-
-
-
-
-

run remote where artifact path includes the run id#

-
-
-
func.deploy()
-
-
-
-
-
-
-
func.run(task, params=task_params,  workdir=mlconf.artifact_path)
-
-
-
-
-
-
-
func.doc()
-
-
-
-
-
function: slack-notify
-Send Slack notification
-default handler: slack_notify
-entry points:
-  slack_notify: Summarize a table
-    context(MLClientCtx)  - the function context
-    webhook_url(str)  - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL
-    slack_blocks(List[str])  - Message blocks list. NOT IMPLEMENTED YET
-    notification_text(str)  - Notification text, default=Notification
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/static/function.html b/functions/development/slack_notify/1.1.0/static/function.html deleted file mode 100644 index 32f2424b..00000000 --- a/functions/development/slack_notify/1.1.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: slack-notify
-  tag: ''
-  hash: 3de7e78ed9b7928af192badf988055086431fb58
-  project: ''
-  labels:
-    author: mdl
-  categories:
-  - utils
-spec:
-  command: ''
-  args: []
-  image: python:3.6-jessie
-  env: []
-  default_handler: slack_notify
-  entry_points:
-    slack_notify:
-      name: slack_notify
-      doc: Summarize a table
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: webhook_url
-        type: str
-        doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks'
-        default: URL
-      - name: slack_blocks
-        type: List[str]
-        doc: Message blocks list. NOT IMPLEMENTED YET
-        default: []
-      - name: notification_text
-        type: str
-        doc: Notification text
-        default: Notification
-      outputs:
-      - default: ''
-      lineno: 14
-  description: Send Slack notification
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo=
-    commands:
-    - python -m pip install requests
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/static/item.html b/functions/development/slack_notify/1.1.0/static/item.html deleted file mode 100644 index 4e0ea018..00000000 --- a/functions/development/slack_notify/1.1.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- utils
-description: Send Slack notification
-doc: ''
-example: slack_notify.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: mdl
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: slack-notify
-platformVersion: 3.5.0
-spec:
-  filename: slack_notify.py
-  handler: slack_notify
-  image: python:3.6-jessie
-  kind: job
-  requirements:
-  - requests
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/static/slack_notify.html b/functions/development/slack_notify/1.1.0/static/slack_notify.html deleted file mode 100644 index c0d48201..00000000 --- a/functions/development/slack_notify/1.1.0/static/slack_notify.html +++ /dev/null @@ -1,188 +0,0 @@ - - - - - - - -slack_notify.slack_notify - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for slack_notify.slack_notify

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
[docs]def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code))
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/slack_notify/1.1.0/static/source.html b/functions/development/slack_notify/1.1.0/static/source.html deleted file mode 100644 index 85d1b6a3..00000000 --- a/functions/development/slack_notify/1.1.0/static/source.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks: List[str] = [],
-    notification_text: str = "Notification",
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-
-    data = {"text": notification_text}
-    print("====", webhook_url)
-    response = requests.post(
-        webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"}
-    )
-
-    print("Response: " + str(response.text))
-    print("Response code: " + str(response.status_code))
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/latest/src/README.md b/functions/development/slack_notify/latest/src/README.md deleted file mode 100644 index 9bde3299..00000000 --- a/functions/development/slack_notify/latest/src/README.md +++ /dev/null @@ -1 +0,0 @@ -# Send Notification to Slack \ No newline at end of file diff --git a/functions/development/slack_notify/latest/src/function.yaml b/functions/development/slack_notify/latest/src/function.yaml deleted file mode 100644 index 95af087c..00000000 --- a/functions/development/slack_notify/latest/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: job -metadata: - name: slack-notify - tag: '' - hash: 3de7e78ed9b7928af192badf988055086431fb58 - project: '' - labels: - author: mdl - categories: - - utils -spec: - command: '' - args: [] - image: python:3.6-jessie - env: [] - default_handler: slack_notify - entry_points: - slack_notify: - name: slack_notify - doc: Summarize a table - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: webhook_url - type: str - doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks' - default: URL - - name: slack_blocks - type: List[str] - doc: Message blocks list. 
NOT IMPLEMENTED YET - default: [] - - name: notification_text - type: str - doc: Notification text - default: Notification - outputs: - - default: '' - lineno: 14 - description: Send Slack notification - build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo= - commands: - - python -m pip install requests - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py - affinity: null -verbose: false diff --git a/functions/development/slack_notify/latest/src/item.yaml b/functions/development/slack_notify/latest/src/item.yaml deleted file mode 100644 index 
6bdfd2c8..00000000 --- a/functions/development/slack_notify/latest/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- utils -description: Send Slack notification -doc: '' -example: slack_notify.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: mdl -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: slack-notify -platformVersion: 3.5.0 -spec: - filename: slack_notify.py - handler: slack_notify - image: python:3.6-jessie - kind: job - requirements: - - requests -url: '' -version: 1.1.0 diff --git a/functions/development/slack_notify/latest/src/slack_notify.ipynb b/functions/development/slack_notify/latest/src/slack_notify.ipynb deleted file mode 100644 index 8119bb8c..00000000 --- a/functions/development/slack_notify/latest/src/slack_notify.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'python:3.6-jessie'\n" - ] - } - ], - "source": [ - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"python:3.6-jessie\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c \n", - "pip install requests" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import requests\n", - "from mlrun.execution import MLClientCtx\n", - "from typing import List" - ] - 
}, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "def slack_notify(\n", - " context: MLClientCtx,\n", - " webhook_url: str = \"URL\",\n", - " slack_blocks: List[str] = [],\n", - " notification_text: str = \"Notification\"\n", - ") -> None:\n", - " \"\"\"Summarize a table\n", - " :param context: the function context\n", - " :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - " :param notification_text: Notification text\n", - " :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET\n", - " \"\"\"\n", - " \n", - " data = {\n", - " 'text': notification_text\n", - " }\n", - " print(\"====\",webhook_url)\n", - " response = requests.post(webhook_url, data=json.dumps(\n", - " data), headers={'Content-Type': 'application/json'})\n", - "\n", - " print('Response: ' + str(response.text))\n", - " print('Response code: ' + str(response.status_code))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"slack_notify\")\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"slack_notify\"\n", - "fn.spec.description = \"Send Slack notification\"\n", - 
"fn.metadata.categories = [\"ops\"]\n", - "fn.metadata.labels = {\"author\": \"mdl\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import import_function\n", - "func = import_function(\"hub://slack_notify\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "\n", - "#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks\n", - "task_params = {\n", - " \"webhook_url\" : \"https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx\",\n", - " \"notification_text\" : \"Test Notification\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(\n", - " name=\"tasks slack notify\", \n", - " params = task_params,\n", - " handler=slack_notify)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run local where artifact path is fixed " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = run_local(task, artifact_path=mlconf.artifact_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### run remote where artifact path includes the run id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "func.run(task, params=task_params, workdir=mlconf.artifact_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "function: slack-notify\n", - "Send Slack notification\n", - "default handler: slack_notify\n", - "entry points:\n", - " slack_notify: Summarize a table\n", - " context(MLClientCtx) - the function context\n", - " webhook_url(str) - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL\n", - " slack_blocks(List[str]) - Message blocks list. NOT IMPLEMENTED YET\n", - " notification_text(str) - Notification text, default=Notification\n" - ] - } - ], - "source": [ - "func.doc()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/slack_notify/latest/src/slack_notify.py b/functions/development/slack_notify/latest/src/slack_notify.py deleted file mode 100644 index 3208ffee..00000000 --- a/functions/development/slack_notify/latest/src/slack_notify.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import os -import json -import requests -from mlrun.execution import MLClientCtx -from typing import List - - -def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code)) diff --git a/functions/development/slack_notify/latest/static/documentation.html b/functions/development/slack_notify/latest/static/documentation.html deleted file mode 100644 index 8e88b8f8..00000000 --- a/functions/development/slack_notify/latest/static/documentation.html +++ /dev/null @@ -1,231 +0,0 @@ - - - - - - - -slack_notify package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

slack_notify package

- -
- -
-
-
-
-
-

slack_notify package#

-
-

Submodules#

-
-
-

slack_notify.slack_notify module#

-
-
-slack_notify.slack_notify.slack_notify(context: mlrun.execution.MLClientCtx, webhook_url: str = 'URL', slack_blocks: List[str] = [], notification_text: str = 'Notification')None[source]#
-

Summarize a table -:param context: the function context -:param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks -:param notification_text: Notification text -:param slack_blocks: Message blocks list. NOT IMPLEMENTED YET

-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/slack_notify/latest/static/example.html b/functions/development/slack_notify/latest/static/example.html deleted file mode 100644 index 12375287..00000000 --- a/functions/development/slack_notify/latest/static/example.html +++ /dev/null @@ -1,443 +0,0 @@ - - - - - - - -mlconfig - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.image = "python:3.6-jessie"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'python:3.6-jessie'
-
-
-
-
-
-
-
%%nuclio cmd -c 
-pip install requests
-
-
-
-
-
-
-
import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-
-
-
-
-
-
import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
-
-
-
-
def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks:  List[str] = [],
-    notification_text: str = "Notification"
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-    
-    data = {
-        'text': notification_text
-    }
-    print("====",webhook_url)
-    response = requests.post(webhook_url, data=json.dumps(
-        data), headers={'Content-Type': 'application/json'})
-
-    print('Response: ' + str(response.text))
-    print('Response code: ' + str(response.status_code))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig#

-
-
-
from mlrun import mlconf
-import os
-
-mlconf.dbpath = 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
-
-

save#

-
-
-
from mlrun import code_to_function
-
-# create job function object from notebook code
-fn = code_to_function("slack_notify")
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "slack_notify"
-fn.spec.description = "Send Slack notification"
-fn.metadata.categories = ["ops"]
-fn.metadata.labels = {"author": "mdl"}
-fn.export("function.yaml")
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun import import_function
-func = import_function("hub://slack_notify")
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-
-#Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-task_params = {
-    "webhook_url" : "https://hooks.slack.com/services/xxxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxx",
-    "notification_text" : "Test Notification"
-}
-
-
-
-
-
-
-
task = NewTask(
-    name="tasks slack notify", 
-    params = task_params,
-    handler=slack_notify)
-
-
-
-
-
-

run local where artifact path is fixed#

-
-
-
run = run_local(task, artifact_path=mlconf.artifact_path)
-
-
-
-
-
-
-

run remote where artifact path includes the run id#

-
-
-
func.deploy()
-
-
-
-
-
-
-
func.run(task, params=task_params,  workdir=mlconf.artifact_path)
-
-
-
-
-
-
-
func.doc()
-
-
-
-
-
function: slack-notify
-Send Slack notification
-default handler: slack_notify
-entry points:
-  slack_notify: Summarize a table
-    context(MLClientCtx)  - the function context
-    webhook_url(str)  - Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks, default=URL
-    slack_blocks(List[str])  - Message blocks list. NOT IMPLEMENTED YET
-    notification_text(str)  - Notification text, default=Notification
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/slack_notify/latest/static/function.html b/functions/development/slack_notify/latest/static/function.html deleted file mode 100644 index 32f2424b..00000000 --- a/functions/development/slack_notify/latest/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: slack-notify
-  tag: ''
-  hash: 3de7e78ed9b7928af192badf988055086431fb58
-  project: ''
-  labels:
-    author: mdl
-  categories:
-  - utils
-spec:
-  command: ''
-  args: []
-  image: python:3.6-jessie
-  env: []
-  default_handler: slack_notify
-  entry_points:
-    slack_notify:
-      name: slack_notify
-      doc: Summarize a table
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: webhook_url
-        type: str
-        doc: 'Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks'
-        default: URL
-      - name: slack_blocks
-        type: List[str]
-        doc: Message blocks list. NOT IMPLEMENTED YET
-        default: []
-      - name: notification_text
-        type: str
-        doc: Notification text
-        default: Notification
-      outputs:
-      - default: ''
-      lineno: 14
-  description: Send Slack notification
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCByZXF1ZXN0cwpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCgpkZWYgc2xhY2tfbm90aWZ5KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB3ZWJob29rX3VybDogc3RyID0gIlVSTCIsCiAgICBzbGFja19ibG9ja3M6IExpc3Rbc3RyXSA9IFtdLAogICAgbm90aWZpY2F0aW9uX3RleHQ6IHN0ciA9ICJOb3RpZmljYXRpb24iLAopIC0+IE5vbmU6CiAgICAiIiJTdW1tYXJpemUgYSB0YWJsZQogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB3ZWJob29rX3VybDogICAgIFNsYWNrIGluY29taW5nIHdlYmhvb2sgVVJMLiBQbGVhc2UgcmVhZDogaHR0cHM6Ly9hcGkuc2xhY2suY29tL21lc3NhZ2luZy93ZWJob29rcwogICAgOnBhcmFtIG5vdGlmaWNhdGlvbl90ZXh0OiAgICAgICAgICAgIE5vdGlmaWNhdGlvbiB0ZXh0CiAgICA6cGFyYW0gc2xhY2tfYmxvY2tzOiAgICAgICAgICBNZXNzYWdlIGJsb2NrcyBsaXN0LiBOT1QgSU1QTEVNRU5URUQgWUVUCiAgICAiIiIKCiAgICBkYXRhID0geyJ0ZXh0Ijogbm90aWZpY2F0aW9uX3RleHR9CiAgICBwcmludCgiPT09PSIsIHdlYmhvb2tfdXJsKQogICAgcmVzcG9uc2UgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHdlYmhvb2tfdXJsLCBkYXRhPWpzb24uZHVtcHMoZGF0YSksIGhlYWRlcnM9eyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICApCgogICAgcHJpbnQoIlJlc3BvbnNlOiAiICsgc3RyKHJlc3BvbnNlLnRleHQpKQogICAgcHJpbnQoIlJlc3BvbnNlIGNvZGU6ICIgKyBzdHIocmVzcG9uc2Uuc3RhdHVzX2NvZGUpKQo=
-    commands:
-    - python -m pip install requests
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/slack_notify/slack_notify.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/latest/static/item.html b/functions/development/slack_notify/latest/static/item.html deleted file mode 100644 index 4e0ea018..00000000 --- a/functions/development/slack_notify/latest/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- utils
-description: Send Slack notification
-doc: ''
-example: slack_notify.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: mdl
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: slack-notify
-platformVersion: 3.5.0
-spec:
-  filename: slack_notify.py
-  handler: slack_notify
-  image: python:3.6-jessie
-  kind: job
-  requirements:
-  - requests
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/slack_notify/latest/static/slack_notify.html b/functions/development/slack_notify/latest/static/slack_notify.html deleted file mode 100644 index c0d48201..00000000 --- a/functions/development/slack_notify/latest/static/slack_notify.html +++ /dev/null @@ -1,188 +0,0 @@ - - - - - - - -slack_notify.slack_notify - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for slack_notify.slack_notify

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-
[docs]def slack_notify( - context: MLClientCtx, - webhook_url: str = "URL", - slack_blocks: List[str] = [], - notification_text: str = "Notification", -) -> None: - """Summarize a table - :param context: the function context - :param webhook_url: Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks - :param notification_text: Notification text - :param slack_blocks: Message blocks list. NOT IMPLEMENTED YET - """ - - data = {"text": notification_text} - print("====", webhook_url) - response = requests.post( - webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"} - ) - - print("Response: " + str(response.text)) - print("Response code: " + str(response.status_code))
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/slack_notify/latest/static/source.html b/functions/development/slack_notify/latest/static/source.html deleted file mode 100644 index 85d1b6a3..00000000 --- a/functions/development/slack_notify/latest/static/source.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import os
-import json
-import requests
-from mlrun.execution import MLClientCtx
-from typing import List
-
-
-def slack_notify(
-    context: MLClientCtx,
-    webhook_url: str = "URL",
-    slack_blocks: List[str] = [],
-    notification_text: str = "Notification",
-) -> None:
-    """Summarize a table
-    :param context:         the function context
-    :param webhook_url:     Slack incoming webhook URL. Please read: https://api.slack.com/messaging/webhooks
-    :param notification_text:            Notification text
-    :param slack_blocks:          Message blocks list. NOT IMPLEMENTED YET
-    """
-
-    data = {"text": notification_text}
-    print("====", webhook_url)
-    response = requests.post(
-        webhook_url, data=json.dumps(data), headers={"Content-Type": "application/json"}
-    )
-
-    print("Response: " + str(response.text))
-    print("Response code: " + str(response.status_code))
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/src/README.md b/functions/development/snowflake_dask/0.9.0/src/README.md deleted file mode 100644 index b4df32ad..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# **Data Preperation Function** - -## `Snowflake_dask` - -![](img/snowflake-dask.png) - -This function query the data from a snowflake database and process the results -in parallel in a Dask cluster. -It will publish the dask dataframe in the cluster for other process to use. -It can also write the results dataframe to parquet files. - -```markdown - -:param context: the function context -:param dask_client: dask cluster function name -:param connection_info: Snowflake database connection info (this will be in a secret later) -:param query: query to for Snowflake -:param parquet_out_dir: directory path for the output parquet files (default None, not write out) -:param publish_name: name of the dask dataframe to publish to the dask cluster (default None, not publish) -``` \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/src/config-template.yaml b/functions/development/snowflake_dask/0.9.0/src/config-template.yaml deleted file mode 100644 index fb46ac2e..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/config-template.yaml +++ /dev/null @@ -1,5 +0,0 @@ -user: "..." -password: "..." -warehouse: "..." -account: "..." 
-application: "Iguazio" \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/src/function.yaml b/functions/development/snowflake_dask/0.9.0/src/function.yaml deleted file mode 100644 index 8adffc07..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/function.yaml +++ /dev/null @@ -1,77 +0,0 @@ -kind: job -metadata: - name: snowflake-dask - tag: '' - hash: bc61b7ecda9966b7b700ed39d6e0d7e653ecdf66 - project: snowflake-dask - labels: - author: xingsheng - categories: - - data-prep -spec: - command: '' - args: [] - image: .mlrun/func-snowflake-dask-snowflake-dask:latest - build: - functionSourceCode: IiIiU25vd2ZsYWtlIERhc2sgLSBJbmdlc3QgU25hb3dmbGFrZSBkYXRhIHdpdGggRGFzayIiIgppbXBvcnQgd2FybmluZ3MKaW1wb3J0IG1scnVuCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eAppbXBvcnQgc25vd2ZsYWtlLmNvbm5lY3RvciBhcyBzbm93CmZyb20gZGFzay5kaXN0cmlidXRlZCBpbXBvcnQgQ2xpZW50CmZyb20gZGFzay5kYXRhZnJhbWUgaW1wb3J0IGZyb21fZGVsYXllZApmcm9tIGRhc2sgaW1wb3J0IGRlbGF5ZWQKZnJvbSBkYXNrIGltcG9ydCBkYXRhZnJhbWUgYXMgZGQKCndhcm5pbmdzLmZpbHRlcndhcm5pbmdzKCJpZ25vcmUiKQoKQGRlbGF5ZWQKZGVmIGxvYWQoYmF0Y2gpOgoKICAgICIiIkEgZGVsYXllZCBsb2FkIG9uZSBiYXRjaC4iIiIKCiAgICB0cnk6CiAgICAgICAgcHJpbnQoIkJBVENISU5HIikKICAgICAgICBkZl8gPSBiYXRjaC50b19wYW5kYXMoKQogICAgICAgIHJldHVybiBkZl8KICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICBwcmludChmIkZhaWxlZCBvbiB7YmF0Y2h9IGZvciB7ZX0iKQogICAgICAgIHJhaXNlCgpkZWYgbG9hZF9yZXN1bHRzKGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgICAgICAgICAgICAgIGRhc2tfY2xpZW50OiBzdHIsCiAgICAgICAgICAgICAgICAgY29ubmVjdGlvbl9pbmZvOiBzdHIsCiAgICAgICAgICAgICAgICAgcXVlcnk6IHN0ciwKICAgICAgICAgICAgICAgICBwYXJxdWV0X291dF9kaXIgPSBOb25lLAogICAgICAgICAgICAgICAgIHB1Ymxpc2hfbmFtZSA9IE5vbmUKICAgICAgICAgICAgICAgICkgLT4gTm9uZToKCiAgICAiIiJTbm93Zmxha2UgRGFzayAtIEluZ2VzdCBTbmFvd2ZsYWtlIGRhdGEgd2l0aCBEYXNrCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhc2tfY2xpZW50OiAgICAgICBkYXNrIGNsdXN0ZXIgZnVuY3Rpb24gbmFtZQogICAgOnBhcmFtIGNvbm5lY3Rpb25faW5mbzogICBTbm93Zm
xha2UgZGF0YWJhc2UgY29ubmVjdGlvbiBpbmZvICh0aGlzIHdpa2sgYmUgaW4gYSBzZWNyZXQgbGF0ZXIpCiAgICA6cGFyYW0gcXVlcnk6ICAgICAgICAgICAgIHF1ZXJ5IHRvIGZvciBTbm93Zmxha2UKICAgIDpwYXJhbSBwYXJxdWV0X291dF9kaXI6ICAgZGlyZWN0b3J5IHBhdGggZm9yIHRoZSBvdXRwdXQgcGFycXVldCBmaWxlcwogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAoZGVmYXVsdCBOb25lLCBub3Qgd3JpdGUgb3V0KQogICAgOnBhcmFtIHB1Ymxpc2hfbmFtZTogICAgICBuYW1lIG9mIHRoZSBkYXNrIGRhdGFmcmFtZSB0byBwdWJsaXNoIHRvIHRoZSBkYXNrIGNsdXN0ZXIKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQgTm9uZSwgbm90IHB1Ymxpc2gpCgogICAgIiIiCiAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgoJ3NuYXdmbGFrZS1kYXNrLWNsdXN0ZXInKQoKICAgICMgc2V0dXAgZGFzayBjbGllbnQgZnJvbSB0aGUgTUxSdW4gZGFzayBjbHVzdGVyIGZ1bmN0aW9uCiAgICBpZiBkYXNrX2NsaWVudDoKICAgICAgICBjbGllbnQgPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oZGFza19jbGllbnQpLmNsaWVudAogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidFeGlzdGluZyBkYXNrIGNsaWVudCA9PT0gPj4+IHtjbGllbnR9XG4nKQogICAgZWxzZToKICAgICAgICBjbGllbnQgPSBDbGllbnQoKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidcbk5ld2x5IGNyZWF0ZWQgZGFzayBjbGllbnQgPT09ID4+PiB7Y2xpZW50fVxuJykKCiAgICBjb25uID0gc25vdy5jb25uZWN0KCoqY29ubmVjdGlvbl9pbmZvKQogICAgY3VyID0gY29ubi5jdXJzb3IoKQogICAgY3VyLmV4ZWN1dGUocXVlcnkpCiAgICBiYXRjaGVzID0gY3VyLmdldF9yZXN1bHRfYmF0Y2hlcygpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnYmF0Y2hlcyBsZW4gPT09IHtsZW4oYmF0Y2hlcyl9XG4nKQoKICAgIGRmcyA9IFtdCiAgICBmb3IgYmF0Y2ggaW4gYmF0Y2hlczoKICAgICAgICBpZiBiYXRjaC5yb3djb3VudCA+IDA6CiAgICAgICAgICAgIGRmID0gbG9hZChiYXRjaCkKICAgICAgICAgICAgZGZzLmFwcGVuZChkZikKICAgIGRkZiA9IGZyb21fZGVsYXllZChkZnMpCgogICAgIyBtYXRlcmlhbGl6ZSB0aGUgcXVlcnkgcmVzdWx0cyBzZXQgZm9yIHNvbWUgc2FtcGxlIGNvbXB1dGUKCiAgICBkZGZfZGVzY3JpYmUgPSBkZGYuZGVzY3JpYmUoKS5jb21wdXRlKCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncXVlcnkgID09PSA+Pj4ge3F1ZXJ5fVxuJykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidkZGYgID09PSA+Pj4ge2RkZn1cbicpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ251bWJlciBvZiByb3dzJywgbGVuKGRkZi5pbmRleCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJkZGZfZGVzY3JpYmUiLCBkZj1kZGZfZGVzY3JpYmUpCgogICAgaWYgcHVibGlzaF9uYW1lOgogICAgICAgIG
NvbnRleHQubG9nX3Jlc3VsdCgnZGF0YV9zZXRfbmFtZScsIHB1Ymxpc2hfbmFtZSkKICAgICAgICBpZiBub3QgY2xpZW50Lmxpc3RfZGF0YXNldHMoKToKICAgICAgICAgICAgZGRmLnBlcnNpc3QobmFtZSA9IHB1Ymxpc2hfbmFtZSkKICAgICAgICAgICAgY2xpZW50LnB1Ymxpc2hfZGF0YXNldChwdWJsaXNoX25hbWU9ZGRmKQoKICAgIGlmIHBhcnF1ZXRfb3V0X2RpcjoKICAgICAgICBkZC50b19wYXJxdWV0KGRmPWRkZiwgcGF0aD1wYXJxdWV0X291dF9kaXIpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdwYXJxdWV0IGRpcmVjdG9yeScsIHBhcnF1ZXRfb3V0X2RpcikK - base_image: mlrun/mlrun - commands: - - python -m pip install bokeh snowflake-connector-python[pandas] - code_origin: https://github.com/xsqian/functions.git#b8c0c4307783069f3696ebdbfee8df7d64fbd0dc:snowflake_dask.py - origin_filename: snowflake_dask.py - entry_points: - load: - name: load - doc: A delayed load one batch. - parameters: - - name: batch - default: '' - outputs: - - default: '' - lineno: 13 - load_results: - name: load_results - doc: Snowflake Dask - Ingest Snaowflake data with Dask - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: dask_client - type: str - doc: dask cluster function name - default: '' - - name: connection_info - type: str - doc: Snowflake database connection info (this wikk be in a secret later) - default: '' - - name: query - type: str - doc: query to for Snowflake - default: '' - - name: parquet_out_dir - doc: directory path for the output parquet files (default None, not write - out) - default: null - - name: publish_name - doc: name of the dask dataframe to publish to the dask cluster (default None, - not publish) - default: null - outputs: - - default: '' - lineno: 26 - description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster - default_handler: load_results - disable_auto_mount: false - env: - - name: V3IO_API - value: '' - - name: V3IO_USERNAME - value: '' - - name: V3IO_ACCESS_KEY - value: '' - - name: V3IO_FRAMESD - value: '' - priority_class_name: igz-workload-medium - affinity: null -verbose: false diff --git 
a/functions/development/snowflake_dask/0.9.0/src/img/snowflake-dask.png b/functions/development/snowflake_dask/0.9.0/src/img/snowflake-dask.png deleted file mode 100644 index 30a25282..00000000 Binary files a/functions/development/snowflake_dask/0.9.0/src/img/snowflake-dask.png and /dev/null differ diff --git a/functions/development/snowflake_dask/0.9.0/src/item.yaml b/functions/development/snowflake_dask/0.9.0/src/item.yaml deleted file mode 100644 index 8f1cd788..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster -doc: '' -example: snowflake-dask-mlrun.ipynb -generationDate: 2022-03-20:12-28 -icon: '' -labels: - author: xingsheng - framework: dask -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.9.1 -name: snowflake_dask -platformVersion: 3.2.0 -spec: - filename: snowflake_dask.py - handler: load_results - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/snowflake_dask/0.9.0/src/requirements.txt b/functions/development/snowflake_dask/0.9.0/src/requirements.txt deleted file mode 100644 index f390b5ba..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -bokeh -snowflake-connector-python[pandas] -mlrun~=0.9.1 \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/src/snowflake-dask-mlrun.ipynb b/functions/development/snowflake_dask/0.9.0/src/snowflake-dask-mlrun.ipynb deleted file mode 100644 index 2e61f764..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/snowflake-dask-mlrun.ipynb +++ /dev/null @@ -1,477 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# This notebook is to create a function to ingest data from snowflake with a Dask cluster" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "The dask frameworks enables users to parallelize their python code and run it as a distributed process on Iguazio cluster and dramatically accelerate their performance.
\n", - "In this notebook we'll create an mlrun function running as a dask client to ingest data from snowflake.
\n", - "It also demonstrates how to run parallelize query against snowflake using Dask Delayed option to query a large data set from snowflake.
\n", - "The function will be published on the function marketplace.
\n", - "For more information on dask over kubernetes: https://kubernetes.dask.org/en/latest/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up the enviroment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-03-17 17:11:56,500 [info] loaded project snowflake-dask from MLRun DB\n", - "artifact_path = ('snowflake-dask', '/v3io/projects/snowflake-dask')\n" - ] - } - ], - "source": [ - "import mlrun\n", - "import os\n", - "import warnings\n", - "import yaml\n", - "\n", - "project_name = \"snowflake-dask\"\n", - "dask_cluster_name=\"snowflake-dask-cluster\"\n", - "artifact_path = mlrun.set_environment(project=project_name,\n", - " artifact_path = os.path.join(os.path.abspath('/v3io/projects/'), project_name))\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "print(f'artifact_path = {artifact_path}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load snowflake configuration from config file. \n", - "This is for demo purpose, in the real production code, you would need to put the snowflake connection info into secrets use the secrets in the running pod to connect to snowflake" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nf77378.eu-west-2.aws\n" - ] - } - ], - "source": [ - "# Load connection info\n", - "with open(\".config.yaml\") as f:\n", - " connection_info = yaml.safe_load(f)\n", - "\n", - "# verify the config\n", - "print(connection_info['account'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a python function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function querys data from snowflake using snowflake python connector for parallel processing of the query results.
\n", - "With snoeflake python connector, when you execute a query, the cursor will return the result batches.
\n", - "Using Dask Delayed it will return and process results set in parallel.
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### write the function to a py file" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting snowflake_dask.py\n" - ] - } - ], - "source": [ - "%%writefile snowflake_dask.py\n", - "\"\"\"Snowflake Dask - Ingest Snaowflake data with Dask\"\"\"\n", - "import warnings\n", - "import mlrun\n", - "from mlrun.execution import MLClientCtx\n", - "import snowflake.connector as snow\n", - "from dask.distributed import Client\n", - "from dask.dataframe import from_delayed\n", - "from dask import delayed\n", - "from dask import dataframe as dd\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "@delayed\n", - "def load(batch):\n", - "\n", - " \"\"\"A delayed load one batch.\"\"\"\n", - "\n", - " try:\n", - " print(\"BATCHING\")\n", - " df_ = batch.to_pandas()\n", - " return df_\n", - " except Exception as e:\n", - " print(f\"Failed on {batch} for {e}\")\n", - " raise\n", - "\n", - "def load_results(context: MLClientCtx,\n", - " dask_client: str,\n", - " connection_info: str,\n", - " query: str,\n", - " parquet_out_dir = None,\n", - " publish_name = None\n", - " ) -> None:\n", - "\n", - " \"\"\"Snowflake Dask - Ingest Snaowflake data with Dask\n", - "\n", - " :param context: the function context\n", - " :param dask_client: dask cluster function name\n", - " :param connection_info: Snowflake database connection info (this will be in a secret later)\n", - " :param query: query to for Snowflake\n", - " :param parquet_out_dir: directory path for the output parquet files\n", - " (default None, not write out)\n", - " :param publish_name: name of the dask dataframe to publish to the dask cluster\n", - " (default None, not publish)\n", - "\n", - " \"\"\"\n", - " context = mlrun.get_or_create_ctx('snawflake-dask-cluster')\n", - "\n", - " # setup dask client from the MLRun dask cluster 
function\n", - " if dask_client:\n", - " client = mlrun.import_function(dask_client).client\n", - " context.logger.info(f'Existing dask client === >>> {client}\\n')\n", - " else:\n", - " client = Client()\n", - " context.logger.info(f'\\nNewly created dask client === >>> {client}\\n')\n", - "\n", - " conn = snow.connect(**connection_info)\n", - " cur = conn.cursor()\n", - " cur.execute(query)\n", - " batches = cur.get_result_batches()\n", - " context.logger.info(f'batches len === {len(batches)}\\n')\n", - "\n", - " dfs = []\n", - " for batch in batches:\n", - " if batch.rowcount > 0:\n", - " df = load(batch)\n", - " dfs.append(df)\n", - " ddf = from_delayed(dfs)\n", - "\n", - " # materialize the query results set for some sample compute\n", - "\n", - " ddf_describe = ddf.describe().compute()\n", - "\n", - " context.logger.info(f'query === >>> {query}\\n')\n", - " context.logger.info(f'ddf === >>> {ddf}\\n')\n", - " context.log_result('number of rows', len(ddf.index))\n", - " context.log_dataset(\"ddf_describe\", df=ddf_describe)\n", - "\n", - " if publish_name:\n", - " context.log_result('data_set_name', publish_name)\n", - " if not client.list_datasets():\n", - " ddf.persist(name = publish_name)\n", - " client.publish_dataset(publish_name=ddf)\n", - "\n", - " if parquet_out_dir:\n", - " dd.to_parquet(df=ddf, path=parquet_out_dir)\n", - " context.log_result('parquet directory', parquet_out_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Convert the code to MLRun function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use code_to_function to convert the code to MLRun
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn = mlrun.code_to_function(name=\"snowflake-dask\", \n", - " kind='job', \n", - " filename='snowflake_dask.py',\n", - " image='mlrun/mlrun',\n", - " requirements='requirements.txt',\n", - " handler=\"load_results\", \n", - " description=\"Snowflake Dask - Ingest snowflake data in parallel with Dask cluster\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"xingsheng\"}\n", - " )\n", - "fn.apply(mlrun.platforms.auto_mount())\n", - "fn.deploy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### export function to local `function.yaml` file for testing\n", - "in the real usage, we will import a function from hub" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-03-17 17:12:47,044 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')\n", - "# print(fn.to_yaml())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### import a function from local `function.yaml' for testing (Need to change it to import from hub before PR)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "fn = mlrun.import_function(\"./function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# fn = mlrun.import_function(\"hub://snowflake_dask\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### create a dask cluster and specify the configuration for the dask process (e.g. 
replicas, memory etc)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'db://snowflake-dask/snowflake-dask-cluster'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# function URI is db:///\n", - "dask_uri = f'db://{project_name}/{dask_cluster_name}'\n", - "dask_uri" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "dsf = mlrun.new_function(name=dask_cluster_name, \n", - " kind='dask', \n", - " image='mlrun/mlrun',\n", - " requirements=[\"bokeh\", \"snowflake-connector-python[pandas]\"]\n", - " )\n", - "dsf.apply(mlrun.mount_v3io())\n", - "dsf.spec.remote = True\n", - "dsf.spec.min_replicas = 1\n", - "dsf.spec.max_replicas = 10\n", - "dsf.spec.service_type = \"NodePort\"\n", - "dsf.with_requests(mem='4G', cpu='2')\n", - "# dsf.spec.node_port=30088\n", - "# dsf.spec.scheduler_timeout = \"5 days\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dsf.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2022-03-17 17:13:51,354 [info] trying dask client at: tcp://mlrun-snowflake-dask-cluster-15ea793c-d.default-tenant:8786\n", - "> 2022-03-17 17:13:51,391 [info] using remote dask scheduler (mlrun-snowflake-dask-cluster-15ea793c-d) at: tcp://mlrun-snowflake-dask-cluster-15ea793c-d.default-tenant:8786\n" - ] - }, - { - "data": { - "text/html": [ - "dashboard link: default-tenant.app.us-sales-322.iguazio-cd1.com:30088" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "client = dsf.client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the function" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "When running the function you would see a remote dashboard link as part of the result. click on this link takes you to the dask monitoring dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "p = 'my-local-test'\n", - "parquet_path = f\"/v3io/bigdata/pq_from_sf_dask/{p}\"\n", - "\n", - "fn.run(handler = 'load_results',\n", - " params={\"dask_client\": dask_uri, \n", - " \"connection_info\": connection_info, \n", - " \"query\": \"SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER\",\n", - " \"parquet_out_dir\": parquet_path,\n", - " \"publish_name\": \"customer\",\n", - " }\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "client.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Track the progress in the UI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Users can view the progress and detailed information in the mlrun UI by clicking on the uid above.
\n", - "Also, to track the dask progress in the dask UI click on the \"dashboard link\" above the \"client\" section" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/snowflake_dask/0.9.0/src/snowflake_dask.py b/functions/development/snowflake_dask/0.9.0/src/snowflake_dask.py deleted file mode 100644 index f4063191..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/snowflake_dask.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Snowflake Dask - Ingest Snaowflake data with Dask""" -import warnings -import mlrun -from mlrun.execution import MLClientCtx -import snowflake.connector as snow -from dask.distributed import Client -from dask.dataframe import from_delayed -from dask import delayed -from dask import dataframe as dd - -warnings.filterwarnings("ignore") - -@delayed -def load(batch): - - """A delayed load one batch.""" - - try: - print("BATCHING") - df_ = batch.to_pandas() - return df_ - except Exception as e: - print(f"Failed on {batch} for {e}") - raise - -def load_results(context: MLClientCtx, - dask_client: str, - connection_info: str, - query: str, - parquet_out_dir = None, - publish_name = None - ) -> None: - - """Snowflake Dask - Ingest Snaowflake data with Dask - - :param context: the function context - :param dask_client: dask cluster function name - :param connection_info: Snowflake database connection info (this will be in a secret later) - :param query: query to for Snowflake - :param parquet_out_dir: directory path for the output parquet files - (default None, not write out) - :param publish_name: name of the dask 
dataframe to publish to the dask cluster - (default None, not publish) - - """ - context = mlrun.get_or_create_ctx('snawflake-dask-cluster') - - # setup dask client from the MLRun dask cluster function - if dask_client: - client = mlrun.import_function(dask_client).client - context.logger.info(f'Existing dask client === >>> {client}\n') - else: - client = Client() - context.logger.info(f'\nNewly created dask client === >>> {client}\n') - - conn = snow.connect(**connection_info) - cur = conn.cursor() - cur.execute(query) - batches = cur.get_result_batches() - context.logger.info(f'batches len === {len(batches)}\n') - - dfs = [] - for batch in batches: - if batch.rowcount > 0: - df = load(batch) - dfs.append(df) - ddf = from_delayed(dfs) - - # materialize the query results set for some sample compute - - ddf_describe = ddf.describe().compute() - - context.logger.info(f'query === >>> {query}\n') - context.logger.info(f'ddf === >>> {ddf}\n') - context.log_result('number of rows', len(ddf.index)) - context.log_dataset("ddf_describe", df=ddf_describe) - - if publish_name: - context.log_result('data_set_name', publish_name) - if not client.list_datasets(): - ddf.persist(name = publish_name) - client.publish_dataset(publish_name=ddf) - - if parquet_out_dir: - dd.to_parquet(df=ddf, path=parquet_out_dir) - context.log_result('parquet directory', parquet_out_dir) diff --git a/functions/development/snowflake_dask/0.9.0/src/test_snowflake_dask.py b/functions/development/snowflake_dask/0.9.0/src/test_snowflake_dask.py deleted file mode 100644 index 71d29298..00000000 --- a/functions/development/snowflake_dask/0.9.0/src/test_snowflake_dask.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Snowflake Dask unit test""" -from mlrun import import_function - -def test_snowflake_dask(): - """An unit test""" - fn_to_test = import_function("function.yaml") - - # a fake assert to pass the unit test - if fn_to_test.to_yaml().__contains__('job'): - assert True diff --git 
a/functions/development/snowflake_dask/0.9.0/static/documentation.html b/functions/development/snowflake_dask/0.9.0/static/documentation.html deleted file mode 100644 index 28f63c67..00000000 --- a/functions/development/snowflake_dask/0.9.0/static/documentation.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - -snowflake_dask package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

snowflake_dask package

-
-

Submodules

-
-
-

snowflake_dask.snowflake_dask module

-

Snowflake Dask - Ingest Snaowflake data with Dask

-
-
-snowflake_dask.snowflake_dask.load_results(context: mlrun.execution.MLClientCtx, dask_client: str, connection_info: str, query: str, parquet_out_dir=None, publish_name=None)None[source]
-

Snowflake Dask - Ingest Snaowflake data with Dask

-
-
Parameters
-
    -
  • context – the function context

  • -
  • dask_client – dask cluster function name

  • -
  • connection_info – Snowflake database connection info (this will be in a secret later)

  • -
  • query – query to for Snowflake

  • -
  • parquet_out_dir – directory path for the output parquet files -(default None, not write out)

  • -
  • publish_name – name of the dask dataframe to publish to the dask cluster -(default None, not publish)

  • -
-
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/static/example.html b/functions/development/snowflake_dask/0.9.0/static/example.html deleted file mode 100644 index fb540c8c..00000000 --- a/functions/development/snowflake_dask/0.9.0/static/example.html +++ /dev/null @@ -1,473 +0,0 @@ - - - - - - - -This notebook is to create a function to ingest data from snowflake with a Dask cluster - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

This notebook is to create a function to ingest data from snowflake with a Dask cluster

-

The dask frameworks enables users to parallelize their python code and run it as a distributed process on Iguazio cluster and dramatically accelerate their performance.
-In this notebook we’ll create an mlrun function running as a dask client to ingest data from snowflake.
-It also demonstrates how to run parallelize query against snowflake using Dask Delayed option to query a large data set from snowflake.
-The function will be published on the function marketplace.
-For more information on dask over kubernetes: https://kubernetes.dask.org/en/latest/

-
-

Set up the enviroment

-
-
-
import mlrun
-import os
-import warnings
-import yaml
-
-project_name = "snowflake-dask"
-dask_cluster_name="snowflake-dask-cluster"
-artifact_path = mlrun.set_environment(project=project_name,
-                                      artifact_path = os.path.join(os.path.abspath('/v3io/projects/'), project_name))
-
-warnings.filterwarnings("ignore")
-
-print(f'artifact_path = {artifact_path}')
-
-
-
-
-
> 2022-03-17 17:11:56,500 [info] loaded project snowflake-dask from MLRun DB
-artifact_path = ('snowflake-dask', '/v3io/projects/snowflake-dask')
-
-
-
-
-
-
-

Load snowflake configuration from config file.

-

This is for demo purpose, in the real production code, you would need to put the snowflake connection info into secrets use the secrets in the running pod to connect to snowflake

-
-
-
# Load connection info
-with open(".config.yaml") as f:
-    connection_info = yaml.safe_load(f)
-
-# verify the config
-print(connection_info['account'])
-
-
-
-
-
nf77378.eu-west-2.aws
-
-
-
-
-
-
-

Create a python function

-

This function querys data from snowflake using snowflake python connector for parallel processing of the query results.
-With snoeflake python connector, when you execute a query, the cursor will return the result batches.
-Using Dask Delayed it will return and process results set in parallel.

-
-

write the function to a py file

-
-
-
%%writefile snowflake_dask.py
-"""Snowflake Dask - Ingest Snaowflake data with Dask"""
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-def load_results(context: MLClientCtx,
-                 dask_client: str,
-                 connection_info: str,
-                 query: str,
-                 parquet_out_dir = None,
-                 publish_name = None
-                ) -> None:
-
-    """Snowflake Dask - Ingest Snaowflake data with Dask
-
-    :param context:           the function context
-    :param dask_client:       dask cluster function name
-    :param connection_info:   Snowflake database connection info (this will be in a secret later)
-    :param query:             query to for Snowflake
-    :param parquet_out_dir:   directory path for the output parquet files
-                              (default None, not write out)
-    :param publish_name:      name of the dask dataframe to publish to the dask cluster
-                              (default None, not publish)
-
-    """
-    context = mlrun.get_or_create_ctx('snawflake-dask-cluster')
-
-    # setup dask client from the MLRun dask cluster function
-    if dask_client:
-        client = mlrun.import_function(dask_client).client
-        context.logger.info(f'Existing dask client === >>> {client}\n')
-    else:
-        client = Client()
-        context.logger.info(f'\nNewly created dask client === >>> {client}\n')
-
-    conn = snow.connect(**connection_info)
-    cur = conn.cursor()
-    cur.execute(query)
-    batches = cur.get_result_batches()
-    context.logger.info(f'batches len === {len(batches)}\n')
-
-    dfs = []
-    for batch in batches:
-        if batch.rowcount > 0:
-            df = load(batch)
-            dfs.append(df)
-    ddf = from_delayed(dfs)
-
-    # materialize the query results set for some sample compute
-
-    ddf_describe = ddf.describe().compute()
-
-    context.logger.info(f'query  === >>> {query}\n')
-    context.logger.info(f'ddf  === >>> {ddf}\n')
-    context.log_result('number of rows', len(ddf.index))
-    context.log_dataset("ddf_describe", df=ddf_describe)
-
-    if publish_name:
-        context.log_result('data_set_name', publish_name)
-        if not client.list_datasets():
-            ddf.persist(name = publish_name)
-            client.publish_dataset(publish_name=ddf)
-
-    if parquet_out_dir:
-        dd.to_parquet(df=ddf, path=parquet_out_dir)
-        context.log_result('parquet directory', parquet_out_dir)
-
-
-
-
-
Overwriting snowflake_dask.py
-
-
-
-
-
-
-
-

Convert the code to MLRun function

-

Use code_to_function to convert the code to MLRun

-
-
-
fn = mlrun.code_to_function(name="snowflake-dask",  
-                            kind='job', 
-                            filename='snowflake_dask.py',
-                            image='mlrun/mlrun',
-                            requirements='requirements.txt',
-                            handler="load_results", 
-                            description="Snowflake Dask - Ingest snowflake data in parallel with Dask cluster",
-                            categories=["data-prep"],
-                            labels={"author": "xingsheng"}
-                           )
-fn.apply(mlrun.platforms.auto_mount())
-fn.deploy()
-
-
-
-
-
-

export function to local function.yaml file for testing

-

in the real usage, we will import a function from hub

-
-
-
fn.export('function.yaml')
-# print(fn.to_yaml())
-
-
-
-
-
> 2022-03-17 17:12:47,044 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fae6fe3e690>
-
-
-
-
-
-
-

import a function from local `function.yaml’ for testing (Need to change it to import from hub before PR)

-
-
-
fn = mlrun.import_function("./function.yaml")
-
-
-
-
-
-
-
# fn = mlrun.import_function("hub://snowflake_dask")
-
-
-
-
-
-
-

create a dask cluster and specify the configuration for the dask process (e.g. replicas, memory etc)

-
-
-
# function URI is db://<project>/<name>
-dask_uri = f'db://{project_name}/{dask_cluster_name}'
-dask_uri
-
-
-
-
-
'db://snowflake-dask/snowflake-dask-cluster'
-
-
-
-
-
-
-
dsf = mlrun.new_function(name=dask_cluster_name, 
-                         kind='dask', 
-                         image='mlrun/mlrun',
-                         requirements=["bokeh", "snowflake-connector-python[pandas]"]
-                        )
-dsf.apply(mlrun.mount_v3io())
-dsf.spec.remote = True
-dsf.spec.min_replicas = 1
-dsf.spec.max_replicas = 10
-dsf.spec.service_type = "NodePort"
-dsf.with_requests(mem='4G', cpu='2')
-# dsf.spec.node_port=30088
-# dsf.spec.scheduler_timeout = "5 days"
-
-
-
-
-
-
-
dsf.deploy()
-
-
-
-
-
-
-
client = dsf.client
-
-
-
-
-
> 2022-03-17 17:13:51,354 [info] trying dask client at: tcp://mlrun-snowflake-dask-cluster-15ea793c-d.default-tenant:8786
-> 2022-03-17 17:13:51,391 [info] using remote dask scheduler (mlrun-snowflake-dask-cluster-15ea793c-d) at: tcp://mlrun-snowflake-dask-cluster-15ea793c-d.default-tenant:8786
-
-
-
-
-
-
-
-

Run the function

-

When running the function you would see a remote dashboard link as part of the result. click on this link takes you to the dask monitoring dashboard

-
-
-
p = 'my-local-test'
-parquet_path = f"/v3io/bigdata/pq_from_sf_dask/{p}"
-
-fn.run(handler = 'load_results',
-       params={"dask_client": dask_uri, 
-               "connection_info": connection_info, 
-               "query": "SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER",
-               "parquet_out_dir": parquet_path,
-               "publish_name": "customer",
-              }
-      )
-
-
-
-
-
-
-
client.close()
-
-
-
-
-
-
-

Track the progress in the UI

-

Users can view the progress and detailed information in the mlrun UI by clicking on the uid above.
-Also, to track the dask progress in the dask UI click on the “dashboard link” above the “client” section

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/static/function.html b/functions/development/snowflake_dask/0.9.0/static/function.html deleted file mode 100644 index 6886f085..00000000 --- a/functions/development/snowflake_dask/0.9.0/static/function.html +++ /dev/null @@ -1,99 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: snowflake-dask
-  tag: ''
-  hash: bc61b7ecda9966b7b700ed39d6e0d7e653ecdf66
-  project: snowflake-dask
-  labels:
-    author: xingsheng
-  categories:
-  - data-prep
-spec:
-  command: ''
-  args: []
-  image: .mlrun/func-snowflake-dask-snowflake-dask:latest
-  build:
-    functionSourceCode: IiIiU25vd2ZsYWtlIERhc2sgLSBJbmdlc3QgU25hb3dmbGFrZSBkYXRhIHdpdGggRGFzayIiIgppbXBvcnQgd2FybmluZ3MKaW1wb3J0IG1scnVuCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eAppbXBvcnQgc25vd2ZsYWtlLmNvbm5lY3RvciBhcyBzbm93CmZyb20gZGFzay5kaXN0cmlidXRlZCBpbXBvcnQgQ2xpZW50CmZyb20gZGFzay5kYXRhZnJhbWUgaW1wb3J0IGZyb21fZGVsYXllZApmcm9tIGRhc2sgaW1wb3J0IGRlbGF5ZWQKZnJvbSBkYXNrIGltcG9ydCBkYXRhZnJhbWUgYXMgZGQKCndhcm5pbmdzLmZpbHRlcndhcm5pbmdzKCJpZ25vcmUiKQoKQGRlbGF5ZWQKZGVmIGxvYWQoYmF0Y2gpOgoKICAgICIiIkEgZGVsYXllZCBsb2FkIG9uZSBiYXRjaC4iIiIKCiAgICB0cnk6CiAgICAgICAgcHJpbnQoIkJBVENISU5HIikKICAgICAgICBkZl8gPSBiYXRjaC50b19wYW5kYXMoKQogICAgICAgIHJldHVybiBkZl8KICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICBwcmludChmIkZhaWxlZCBvbiB7YmF0Y2h9IGZvciB7ZX0iKQogICAgICAgIHJhaXNlCgpkZWYgbG9hZF9yZXN1bHRzKGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgICAgICAgICAgICAgIGRhc2tfY2xpZW50OiBzdHIsCiAgICAgICAgICAgICAgICAgY29ubmVjdGlvbl9pbmZvOiBzdHIsCiAgICAgICAgICAgICAgICAgcXVlcnk6IHN0ciwKICAgICAgICAgICAgICAgICBwYXJxdWV0X291dF9kaXIgPSBOb25lLAogICAgICAgICAgICAgICAgIHB1Ymxpc2hfbmFtZSA9IE5vbmUKICAgICAgICAgICAgICAgICkgLT4gTm9uZToKCiAgICAiIiJTbm93Zmxha2UgRGFzayAtIEluZ2VzdCBTbmFvd2ZsYWtlIGRhdGEgd2l0aCBEYXNrCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhc2tfY2xpZW50OiAgICAgICBkYXNrIGNsdXN0ZXIgZnVuY3Rpb24gbmFtZQogICAgOnBhcmFtIGNvbm5lY3Rpb25faW5mbzogICBTbm93Zmxha2UgZGF0YWJhc2UgY29ubmVjdGlvbiBpbmZvICh0aGlzIHdpa2sgYmUgaW4gYSBzZWNyZXQgbGF0ZXIpCiAgICA6cGFyYW0gcXVlcnk6ICAgICAgICAgICAgIHF1ZXJ5IHRvIGZvciBTbm93Zmxha2UKICAgIDpwYXJhbSBwYXJxdWV0X291dF9kaXI6ICAgZGlyZWN0b3J5IHBhdGggZm9yIHRoZSBvdXRwdXQgcGFycXVldCBmaWxlcwogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAoZGVmYXVsdCBOb25lLCBub3Qgd3JpdGUgb3V0KQogICAgOnBhcmFtIHB1Ymxpc2hfbmFtZTogICAgICBuYW1lIG9mIHRoZSBkYXNrIGRhdGFmcmFtZSB0byBwdWJsaXNoIHRvIHRoZSBkYXNrIGNsdXN0ZXIKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgKGRlZmF1bHQgTm9uZSwgbm90IHB1Ymxpc2gpCgogICAgIiIiCiAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgoJ3NuYXdmbGFrZS1kYXNrLWNsdXN0ZXI
nKQoKICAgICMgc2V0dXAgZGFzayBjbGllbnQgZnJvbSB0aGUgTUxSdW4gZGFzayBjbHVzdGVyIGZ1bmN0aW9uCiAgICBpZiBkYXNrX2NsaWVudDoKICAgICAgICBjbGllbnQgPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oZGFza19jbGllbnQpLmNsaWVudAogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidFeGlzdGluZyBkYXNrIGNsaWVudCA9PT0gPj4+IHtjbGllbnR9XG4nKQogICAgZWxzZToKICAgICAgICBjbGllbnQgPSBDbGllbnQoKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidcbk5ld2x5IGNyZWF0ZWQgZGFzayBjbGllbnQgPT09ID4+PiB7Y2xpZW50fVxuJykKCiAgICBjb25uID0gc25vdy5jb25uZWN0KCoqY29ubmVjdGlvbl9pbmZvKQogICAgY3VyID0gY29ubi5jdXJzb3IoKQogICAgY3VyLmV4ZWN1dGUocXVlcnkpCiAgICBiYXRjaGVzID0gY3VyLmdldF9yZXN1bHRfYmF0Y2hlcygpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnYmF0Y2hlcyBsZW4gPT09IHtsZW4oYmF0Y2hlcyl9XG4nKQoKICAgIGRmcyA9IFtdCiAgICBmb3IgYmF0Y2ggaW4gYmF0Y2hlczoKICAgICAgICBpZiBiYXRjaC5yb3djb3VudCA+IDA6CiAgICAgICAgICAgIGRmID0gbG9hZChiYXRjaCkKICAgICAgICAgICAgZGZzLmFwcGVuZChkZikKICAgIGRkZiA9IGZyb21fZGVsYXllZChkZnMpCgogICAgIyBtYXRlcmlhbGl6ZSB0aGUgcXVlcnkgcmVzdWx0cyBzZXQgZm9yIHNvbWUgc2FtcGxlIGNvbXB1dGUKCiAgICBkZGZfZGVzY3JpYmUgPSBkZGYuZGVzY3JpYmUoKS5jb21wdXRlKCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncXVlcnkgID09PSA+Pj4ge3F1ZXJ5fVxuJykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidkZGYgID09PSA+Pj4ge2RkZn1cbicpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ251bWJlciBvZiByb3dzJywgbGVuKGRkZi5pbmRleCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJkZGZfZGVzY3JpYmUiLCBkZj1kZGZfZGVzY3JpYmUpCgogICAgaWYgcHVibGlzaF9uYW1lOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnZGF0YV9zZXRfbmFtZScsIHB1Ymxpc2hfbmFtZSkKICAgICAgICBpZiBub3QgY2xpZW50Lmxpc3RfZGF0YXNldHMoKToKICAgICAgICAgICAgZGRmLnBlcnNpc3QobmFtZSA9IHB1Ymxpc2hfbmFtZSkKICAgICAgICAgICAgY2xpZW50LnB1Ymxpc2hfZGF0YXNldChwdWJsaXNoX25hbWU9ZGRmKQoKICAgIGlmIHBhcnF1ZXRfb3V0X2RpcjoKICAgICAgICBkZC50b19wYXJxdWV0KGRmPWRkZiwgcGF0aD1wYXJxdWV0X291dF9kaXIpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdwYXJxdWV0IGRpcmVjdG9yeScsIHBhcnF1ZXRfb3V0X2RpcikK
-    base_image: mlrun/mlrun
-    commands:
-    - python -m pip install bokeh snowflake-connector-python[pandas]
-    code_origin: https://github.com/xsqian/functions.git#b8c0c4307783069f3696ebdbfee8df7d64fbd0dc:snowflake_dask.py
-    origin_filename: snowflake_dask.py
-  entry_points:
-    load:
-      name: load
-      doc: A delayed load one batch.
-      parameters:
-      - name: batch
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 13
-    load_results:
-      name: load_results
-      doc: Snowflake Dask - Ingest Snaowflake data with Dask
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: dask_client
-        type: str
-        doc: dask cluster function name
-        default: ''
-      - name: connection_info
-        type: str
-        doc: Snowflake database connection info (this wikk be in a secret later)
-        default: ''
-      - name: query
-        type: str
-        doc: query to for Snowflake
-        default: ''
-      - name: parquet_out_dir
-        doc: directory path for the output parquet files (default None, not write
-          out)
-        default: null
-      - name: publish_name
-        doc: name of the dask dataframe to publish to the dask cluster (default None,
-          not publish)
-        default: null
-      outputs:
-      - default: ''
-      lineno: 26
-  description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster
-  default_handler: load_results
-  disable_auto_mount: false
-  env:
-  - name: V3IO_API
-    value: ''
-  - name: V3IO_USERNAME
-    value: ''
-  - name: V3IO_ACCESS_KEY
-    value: ''
-  - name: V3IO_FRAMESD
-    value: ''
-  priority_class_name: igz-workload-medium
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/static/item.html b/functions/development/snowflake_dask/0.9.0/static/item.html deleted file mode 100644 index 6135f081..00000000 --- a/functions/development/snowflake_dask/0.9.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster
-doc: ''
-example: snowflake-dask-mlrun.ipynb
-generationDate: 2022-03-20:12-28
-icon: ''
-labels:
-  author: xingsheng
-  framework: dask
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.9.1
-name: snowflake_dask
-platformVersion: 3.2.0
-spec:
-  filename: snowflake_dask.py
-  handler: load_results
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/0.9.0/static/source.html b/functions/development/snowflake_dask/0.9.0/static/source.html deleted file mode 100644 index 0d597fb8..00000000 --- a/functions/development/snowflake_dask/0.9.0/static/source.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-"""Snowflake Dask - Ingest Snaowflake data with Dask"""
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-def load_results(context: MLClientCtx,
-                 dask_client: str,
-                 connection_info: str,
-                 query: str,
-                 parquet_out_dir = None,
-                 publish_name = None
-                ) -> None:
-
-    """Snowflake Dask - Ingest Snaowflake data with Dask
-
-    :param context:           the function context
-    :param dask_client:       dask cluster function name
-    :param connection_info:   Snowflake database connection info (this will be in a secret later)
-    :param query:             query to for Snowflake
-    :param parquet_out_dir:   directory path for the output parquet files
-                              (default None, not write out)
-    :param publish_name:      name of the dask dataframe to publish to the dask cluster
-                              (default None, not publish)
-
-    """
-    context = mlrun.get_or_create_ctx('snawflake-dask-cluster')
-
-    # setup dask client from the MLRun dask cluster function
-    if dask_client:
-        client = mlrun.import_function(dask_client).client
-        context.logger.info(f'Existing dask client === >>> {client}\n')
-    else:
-        client = Client()
-        context.logger.info(f'\nNewly created dask client === >>> {client}\n')
-
-    conn = snow.connect(**connection_info)
-    cur = conn.cursor()
-    cur.execute(query)
-    batches = cur.get_result_batches()
-    context.logger.info(f'batches len === {len(batches)}\n')
-
-    dfs = []
-    for batch in batches:
-        if batch.rowcount > 0:
-            df = load(batch)
-            dfs.append(df)
-    ddf = from_delayed(dfs)
-
-    # materialize the query results set for some sample compute
-
-    ddf_describe = ddf.describe().compute()
-
-    context.logger.info(f'query  === >>> {query}\n')
-    context.logger.info(f'ddf  === >>> {ddf}\n')
-    context.log_result('number of rows', len(ddf.index))
-    context.log_dataset("ddf_describe", df=ddf_describe)
-
-    if publish_name:
-        context.log_result('data_set_name', publish_name)
-        if not client.list_datasets():
-            ddf.persist(name = publish_name)
-            client.publish_dataset(publish_name=ddf)
-
-    if parquet_out_dir:
-        dd.to_parquet(df=ddf, path=parquet_out_dir)
-        context.log_result('parquet directory', parquet_out_dir)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/src/README.md b/functions/development/snowflake_dask/1.1.0/src/README.md deleted file mode 100644 index 70fa3c92..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# **Data Preperation Function** - -## `Snowflake_dask` - -![](img/snowflake-dask.png) - -This function query the data from a snowflake database and process the results -in parallel in a Dask cluster. -It will publish the dask dataframe in the cluster for other process to use. -It can also write the results dataframe to parquet files. - -```markdown - -:param context: the function context -:param dask_client: dask cluster function name -:param connection_info: Snowflake database connection info (this will be in a secret later) -:param query: query to for Snowflake -:param parquet_out_dir: directory path for the output parquet files (default None, not write out) -:param publish_name: name of the dask dataframe to publish to the dask cluster (default None, not publish) -``` - -To use the function, you will need to either have the password or key pair authentication to Snowflake configured. - -To get the password, or generate key pair in Snowflake and configure Snowflake for key pair authentication, please follow Snowflake [documentation](https://docs.snowflake.com/en/user-guide/key-pair-auth.html) here. - -After obtained password or key pair, please set up the project secrets in your Iguazio cluster. - -If you are using password, you only need to add ```sfPassword``` secret to the project settings. - -If you are using the key pair authentication, you will need to add both ```pkPath``` and ```pkPassword``` to the project settings. - - where: - - ```pkPath``` is the file path to your private key file in the cluster, for example ```/User/rsa_key.p8``` - -```pkPassword``` is your private key encryption password. Please see the screenshot below for your reference. 
- -![Secrets Screenshot](img/iguazio-project-secrets.png) diff --git a/functions/development/snowflake_dask/1.1.0/src/config-template.yaml b/functions/development/snowflake_dask/1.1.0/src/config-template.yaml deleted file mode 100644 index fb46ac2e..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/config-template.yaml +++ /dev/null @@ -1,5 +0,0 @@ -user: "..." -password: "..." -warehouse: "..." -account: "..." -application: "Iguazio" \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/src/function.yaml b/functions/development/snowflake_dask/1.1.0/src/function.yaml deleted file mode 100644 index c9cc8d74..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/function.yaml +++ /dev/null @@ -1,81 +0,0 @@ -kind: job -metadata: - name: snowflake-dask - tag: '' - hash: a002c7743b4a7471c7befe00f5497de050ebe902 - project: snowflake-dask - labels: - author: xingsheng - categories: - - data-prep - credentials: - access_key: ec09bfc8-1cb4-466d-9049-852081973ce3 -spec: - command: '' - args: [] - image: .mlrun/func-snowflake-dask-snowflake-dask:latest - build: - functionSourceCode: 
IiIiU25vd2ZsYWtlIERhc2sgLSBJbmdlc3QgU25hb3dmbGFrZSBkYXRhIHdpdGggRGFzayIiIgppbXBvcnQgd2FybmluZ3MKaW1wb3J0IG1scnVuCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eAppbXBvcnQgc25vd2ZsYWtlLmNvbm5lY3RvciBhcyBzbm93CmZyb20gZGFzay5kaXN0cmlidXRlZCBpbXBvcnQgQ2xpZW50CmZyb20gZGFzay5kYXRhZnJhbWUgaW1wb3J0IGZyb21fZGVsYXllZApmcm9tIGRhc2sgaW1wb3J0IGRlbGF5ZWQKZnJvbSBkYXNrIGltcG9ydCBkYXRhZnJhbWUgYXMgZGQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LmJhY2tlbmRzIGltcG9ydCBkZWZhdWx0X2JhY2tlbmQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LnByaW1pdGl2ZXMgaW1wb3J0IHNlcmlhbGl6YXRpb24KCndhcm5pbmdzLmZpbHRlcndhcm5pbmdzKCJpZ25vcmUiKQoKQGRlbGF5ZWQKZGVmIGxvYWQoYmF0Y2gpOgoKICAgICIiIkEgZGVsYXllZCBsb2FkIG9uZSBiYXRjaC4iIiIKCiAgICB0cnk6CiAgICAgICAgcHJpbnQoIkJBVENISU5HIikKICAgICAgICBkZl8gPSBiYXRjaC50b19wYW5kYXMoKQogICAgICAgIHJldHVybiBkZl8KICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICBwcmludChmIkZhaWxlZCBvbiB7YmF0Y2h9IGZvciB7ZX0iKQogICAgICAgIHJhaXNlCgpkZWYgbG9hZF9yZXN1bHRzKGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgICAgICAgICAgICAgIGRhc2tfY2xpZW50OiBzdHIsCiAgICAgICAgICAgICAgICAgY29ubmVjdGlvbl9pbmZvOiBzdHIsCiAgICAgICAgICAgICAgICAgcXVlcnk6IHN0ciwKICAgICAgICAgICAgICAgICBwYXJxdWV0X291dF9kaXIgPSBOb25lLAogICAgICAgICAgICAgICAgIHB1Ymxpc2hfbmFtZSA9IE5vbmUKICAgICAgICAgICAgICAgICkgLT4gTm9uZToKCiAgICAiIiJTbm93Zmxha2UgRGFzayAtIEluZ2VzdCBTbmFvd2ZsYWtlIGRhdGEgd2l0aCBEYXNrCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhc2tfY2xpZW50OiAgICAgICBkYXNrIGNsdXN0ZXIgZnVuY3Rpb24gbmFtZQogICAgOnBhcmFtIGNvbm5lY3Rpb25faW5mbzogICBTbm93Zmxha2UgZGF0YWJhc2UgY29ubmVjdGlvbiBpbmZvICh0aGlzIHdpbGwgYmUgaW4gYSBzZWNyZXQgbGF0ZXIpCiAgICA6cGFyYW0gcXVlcnk6ICAgICAgICAgICAgIHF1ZXJ5IHRvIGZvciBTbm93Zmxha2UKICAgIDpwYXJhbSBwYXJxdWV0X291dF9kaXI6ICAgZGlyZWN0b3J5IHBhdGggZm9yIHRoZSBvdXRwdXQgcGFycXVldCBmaWxlcwogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAoZGVmYXVsdCBOb25lLCBub3Qgd3JpdGUgb3V0KQogICAgOnBhcmFtIHB1Ymxpc2hfbmFtZTogICAgICBuYW1lIG9mIHRoZSBkYXNrIGRhdGFmcmFtZSB0byBwdWJsaXNoIHRvIHRoZSBkYXNrIGNsdXN0ZXIKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgKGRl
ZmF1bHQgTm9uZSwgbm90IHB1Ymxpc2gpCgogICAgIiIiCiAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgoJ3NuYXdmbGFrZS1kYXNrLWNsdXN0ZXInKQogICAgc2ZfcGFzc3dvcmQgPSBjb250ZXh0LmdldF9zZWNyZXQoJ3NmUGFzc3dvcmQnKQogICAgcGtfcGF0aCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGF0aCcpCiAgICBwa19wYXNzd29yZCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGFzc3dvcmQnKQoKICAgIGlmIHBrX3BhdGggYW5kIHBrX3Bhc3N3b3JkOgogICAgICAgIHdpdGggb3Blbihwa19wYXRoLCAicmIiKSBhcyBrZXk6CiAgICAgICAgICAgIHBfa2V5PSBzZXJpYWxpemF0aW9uLmxvYWRfcGVtX3ByaXZhdGVfa2V5KAogICAgICAgICAgICAgICAga2V5LnJlYWQoKSwKICAgICAgICAgICAgICAgIHBhc3N3b3JkPXN0cihwa19wYXNzd29yZCkuZW5jb2RlKCksCiAgICAgICAgICAgICAgICBiYWNrZW5kPWRlZmF1bHRfYmFja2VuZCgpCiAgICAgICAgICAgICkKICAgICAgICBwa2IgPSBwX2tleS5wcml2YXRlX2J5dGVzKAogICAgICAgICAgICBlbmNvZGluZz1zZXJpYWxpemF0aW9uLkVuY29kaW5nLkRFUiwKICAgICAgICAgICAgZm9ybWF0PXNlcmlhbGl6YXRpb24uUHJpdmF0ZUZvcm1hdC5QS0NTOAogICAgICAgICAgICAsZW5jcnlwdGlvbl9hbGdvcml0aG09c2VyaWFsaXphdGlvbi5Ob0VuY3J5cHRpb24oKQogICAgICAgICkKICAgICAgICBjb25uZWN0aW9uX2luZm8ucG9wKCdwYXNzd29yZCcsICdObyBwYXNzd29yZCBmb3VuZCcpCiAgICAgICAgY29ubmVjdGlvbl9pbmZvWydwcml2YXRlX2tleSddID0gcGtiCiAgICBlbGlmIHNmX3Bhc3N3b3JkOgogICAgICAgIGNvbm5lY3Rpb25faW5mb1sncGFzc3dvcmQnXSA9IHNmX3Bhc3N3b3JkCiAgICBlbHNlOgogICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiXG5QbGVhc2Ugc2V0IHVwIHRoZSBzZWNyZXQgZm9yIFNub3dmbGFrZSBpbiB5b3VyIHByb2plY3QhXG4iKQoKICAgICMgc2V0dXAgZGFzayBjbGllbnQgZnJvbSB0aGUgTUxSdW4gZGFzayBjbHVzdGVyIGZ1bmN0aW9uCiAgICBpZiBkYXNrX2NsaWVudDoKICAgICAgICBjbGllbnQgPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oZGFza19jbGllbnQpLmNsaWVudAogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidFeGlzdGluZyBkYXNrIGNsaWVudCA9PT0gPj4+IHtjbGllbnR9XG4nKQogICAgZWxzZToKICAgICAgICBjbGllbnQgPSBDbGllbnQoKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidcbk5ld2x5IGNyZWF0ZWQgZGFzayBjbGllbnQgPT09ID4+PiB7Y2xpZW50fVxuJykKCiAgICBjb25uID0gc25vdy5jb25uZWN0KCoqY29ubmVjdGlvbl9pbmZvKQogICAgY3VyID0gY29ubi5jdXJzb3IoKQogICAgY3VyLmV4ZWN1dGUocXVlcnkpCiAgICBiYXRjaGVzID0gY3VyLmdldF9yZXN1bHRfYmF0Y2hlcygpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnYmF0Y2hlcyBsZW4gPT09IHtsZW4o
YmF0Y2hlcyl9XG4nKQoKICAgIGRmcyA9IFtdCiAgICBmb3IgYmF0Y2ggaW4gYmF0Y2hlczoKICAgICAgICBpZiBiYXRjaC5yb3djb3VudCA+IDA6CiAgICAgICAgICAgIGRmID0gbG9hZChiYXRjaCkKICAgICAgICAgICAgZGZzLmFwcGVuZChkZikKICAgIGRkZiA9IGZyb21fZGVsYXllZChkZnMpCgogICAgIyBtYXRlcmlhbGl6ZSB0aGUgcXVlcnkgcmVzdWx0cyBzZXQgZm9yIHNvbWUgc2FtcGxlIGNvbXB1dGUKCiAgICBkZGZfZGVzY3JpYmUgPSBkZGYuZGVzY3JpYmUoKS5jb21wdXRlKCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncXVlcnkgID09PSA+Pj4ge3F1ZXJ5fVxuJykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidkZGYgID09PSA+Pj4ge2RkZn1cbicpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ251bWJlciBvZiByb3dzJywgbGVuKGRkZi5pbmRleCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJkZGZfZGVzY3JpYmUiLCBkZj1kZGZfZGVzY3JpYmUpCgogICAgaWYgcHVibGlzaF9uYW1lOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnZGF0YV9zZXRfbmFtZScsIHB1Ymxpc2hfbmFtZSkKICAgICAgICBpZiBub3QgY2xpZW50Lmxpc3RfZGF0YXNldHMoKToKICAgICAgICAgICAgZGRmLnBlcnNpc3QobmFtZSA9IHB1Ymxpc2hfbmFtZSkKICAgICAgICAgICAgY2xpZW50LnB1Ymxpc2hfZGF0YXNldChwdWJsaXNoX25hbWU9ZGRmKQoKICAgIGlmIHBhcnF1ZXRfb3V0X2RpcjoKICAgICAgICBkZC50b19wYXJxdWV0KGRmPWRkZiwgcGF0aD1wYXJxdWV0X291dF9kaXIpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdwYXJxdWV0IGRpcmVjdG9yeScsIHBhcnF1ZXRfb3V0X2RpcikK - base_image: mlrun/mlrun - commands: - - python -m pip install bokeh snowflake-connector-python[pandas] mlrun~=0.9.1 - code_origin: https://github.com/xsqian/functions.git#6b31040e2ad762602f335b0589823a1c61a09975:snowflake_dask.py - origin_filename: snowflake_dask.py - entry_points: - load: - name: load - doc: A delayed load one batch. 
- parameters: - - name: batch - default: '' - outputs: - - default: '' - lineno: 15 - load_results: - name: load_results - doc: Snowflake Dask - Ingest Snaowflake data with Dask - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: dask_client - type: str - doc: dask cluster function name - default: '' - - name: connection_info - type: str - doc: Snowflake database connection info (this will be in a secret later) - default: '' - - name: query - type: str - doc: query to for Snowflake - default: '' - - name: parquet_out_dir - doc: directory path for the output parquet files (default None, not write - out) - default: null - - name: publish_name - doc: name of the dask dataframe to publish to the dask cluster (default None, - not publish) - default: null - outputs: - - default: '' - lineno: 28 - description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster - default_handler: load_results - disable_auto_mount: false - env: - - name: V3IO_API - value: '' - - name: V3IO_USERNAME - value: '' - - name: V3IO_ACCESS_KEY - value: '' - - name: V3IO_FRAMESD - value: '' - priority_class_name: igz-workload-medium - preemption_mode: prevent - affinity: null - tolerations: null -verbose: false diff --git a/functions/development/snowflake_dask/1.1.0/src/img/iguazio-project-secrets.png b/functions/development/snowflake_dask/1.1.0/src/img/iguazio-project-secrets.png deleted file mode 100644 index 29f48aa3..00000000 Binary files a/functions/development/snowflake_dask/1.1.0/src/img/iguazio-project-secrets.png and /dev/null differ diff --git a/functions/development/snowflake_dask/1.1.0/src/img/snowflake-dask.png b/functions/development/snowflake_dask/1.1.0/src/img/snowflake-dask.png deleted file mode 100644 index 30a25282..00000000 Binary files a/functions/development/snowflake_dask/1.1.0/src/img/snowflake-dask.png and /dev/null differ diff --git a/functions/development/snowflake_dask/1.1.0/src/item.yaml 
b/functions/development/snowflake_dask/1.1.0/src/item.yaml deleted file mode 100644 index c12d3aba..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster -doc: '' -example: snowflake-dask-mlrun.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: xingsheng - framework: dask -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.1 -name: snowflake_dask -platformVersion: 3.5.0 -spec: - filename: snowflake_dask.py - handler: load_results - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/snowflake_dask/1.1.0/src/requirements.txt b/functions/development/snowflake_dask/1.1.0/src/requirements.txt deleted file mode 100644 index 0bca2c92..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -bokeh -snowflake-connector-python[pandas] diff --git a/functions/development/snowflake_dask/1.1.0/src/snowflake-dask-mlrun.ipynb b/functions/development/snowflake_dask/1.1.0/src/snowflake-dask-mlrun.ipynb deleted file mode 100644 index 03936f2a..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/snowflake-dask-mlrun.ipynb +++ /dev/null @@ -1,437 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# This notebook is to create a function to ingest data from snowflake with a Dask cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dask frameworks enables users to parallelize their python code and run it as a distributed process on Iguazio cluster and dramatically accelerate their performance.
\n", - "In this notebook we'll create an mlrun function running as a dask client to ingest data from snowflake.
\n", - "It also demonstrates how to run parallelize query against snowflake using Dask Delayed option to query a large data set from snowflake.
\n", - "The function will be published on the function marketplace.
\n", - "For more information on dask over kubernetes: https://kubernetes.dask.org/en/latest/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up the enviroment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import os\n", - "import warnings\n", - "import yaml\n", - "\n", - "project_name = \"snowflake-dask\"\n", - "dask_cluster_name=\"snowflake-dask-cluster\"\n", - "artifact_path = mlrun.set_environment(project=project_name,\n", - " artifact_path = os.path.join(os.path.abspath('/v3io/projects/'), project_name))\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "print(f'artifact_path = {artifact_path}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load snowflake configuration from config file. \n", - "This is for demo purpose, in the real production code, you would need to put the snowflake connection info into secrets use the secrets in the running pod to connect to snowflake" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load connection info\n", - "with open(\".config.yaml\") as f:\n", - " connection_info = yaml.safe_load(f)\n", - "\n", - "# verify the config\n", - "print(connection_info['account'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a python function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function querys data from snowflake using snowflake python connector for parallel processing of the query results.
\n", - "With snoeflake python connector, when you execute a query, the cursor will return the result batches.
\n", - "Using Dask Delayed it will return and process results set in parallel.
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### write the function to a py file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile snowflake_dask.py\n", - "\"\"\"Snowflake Dask - Ingest Snowflake data with Dask\"\"\"\n", - "import warnings\n", - "import mlrun\n", - "from mlrun.execution import MLClientCtx\n", - "import snowflake.connector as snow\n", - "from dask.distributed import Client\n", - "from dask.dataframe import from_delayed\n", - "from dask import delayed\n", - "from dask import dataframe as dd\n", - "from cryptography.hazmat.backends import default_backend\n", - "from cryptography.hazmat.primitives import serialization\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "@delayed\n", - "def load(batch):\n", - "\n", - " \"\"\"A delayed load one batch.\"\"\"\n", - "\n", - " try:\n", - " print(\"BATCHING\")\n", - " df_ = batch.to_pandas()\n", - " return df_\n", - " except Exception as e:\n", - " print(f\"Failed on {batch} for {e}\")\n", - " raise\n", - "\n", - "def load_results(context: MLClientCtx,\n", - " dask_client: str,\n", - " connection_info: str,\n", - " query: str,\n", - " parquet_out_dir = None,\n", - " publish_name = None\n", - " ) -> None:\n", - "\n", - " \"\"\"Snowflake Dask - Ingest Snowflake data with Dask\n", - "\n", - " :param context: the function context\n", - " :param dask_client: dask cluster function name\n", - " :param connection_info: Snowflake database connection info (this will be in a secret later)\n", - " :param query: query to for Snowflake\n", - " :param parquet_out_dir: directory path for the output parquet files\n", - " (default None, not write out)\n", - " :param publish_name: name of the dask dataframe to publish to the dask cluster\n", - " (default None, not publish)\n", - "\n", - " \"\"\"\n", - " context = mlrun.get_or_create_ctx('snawflake-dask-cluster')\n", - " sf_password = 
context.get_secret('sfPassword')\n", - " pk_path = context.get_secret('pkPath')\n", - " pk_password = context.get_secret('pkPassword')\n", - "\n", - " if pk_path and pk_password:\n", - " with open(pk_path, \"rb\") as key:\n", - " p_key= serialization.load_pem_private_key(\n", - " key.read(),\n", - " password=str(pk_password).encode(),\n", - " backend=default_backend()\n", - " )\n", - " pkb = p_key.private_bytes(\n", - " encoding=serialization.Encoding.DER,\n", - " format=serialization.PrivateFormat.PKCS8\n", - " ,encryption_algorithm=serialization.NoEncryption()\n", - " )\n", - " connection_info.pop('password', 'No password found')\n", - " connection_info['private_key'] = pkb\n", - " elif sf_password:\n", - " connection_info['password'] = sf_password\n", - " else:\n", - " raise Exception(\"\\nPlease set up the secret for Snowflake in your project!\\n\")\n", - "\n", - " # setup dask client from the MLRun dask cluster function\n", - " if dask_client:\n", - " client = mlrun.import_function(dask_client).client\n", - " context.logger.info(f'Existing dask client === >>> {client}\\n')\n", - " else:\n", - " client = Client()\n", - " context.logger.info(f'\\nNewly created dask client === >>> {client}\\n')\n", - "\n", - " conn = snow.connect(**connection_info)\n", - " cur = conn.cursor()\n", - " cur.execute(query)\n", - " batches = cur.get_result_batches()\n", - " context.logger.info(f'batches len === {len(batches)}\\n')\n", - "\n", - " dfs = []\n", - " for batch in batches:\n", - " if batch.rowcount > 0:\n", - " df = load(batch)\n", - " dfs.append(df)\n", - " ddf = from_delayed(dfs)\n", - "\n", - " # materialize the query results set for some sample compute\n", - "\n", - " ddf_describe = ddf.describe().compute()\n", - "\n", - " context.logger.info(f'query === >>> {query}\\n')\n", - " context.logger.info(f'ddf === >>> {ddf}\\n')\n", - " context.log_result('number of rows', len(ddf.index))\n", - " context.log_dataset(\"ddf_describe\", df=ddf_describe)\n", - "\n", - " if 
publish_name:\n", - " context.log_result('data_set_name', publish_name)\n", - " if not client.list_datasets():\n", - " ddf.persist(name = publish_name)\n", - " client.publish_dataset(publish_name=ddf)\n", - "\n", - " if parquet_out_dir:\n", - " dd.to_parquet(df=ddf, path=parquet_out_dir)\n", - " context.log_result('parquet directory', parquet_out_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Convert the code to MLRun function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use code_to_function to convert the code to MLRun
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "fn = mlrun.code_to_function(name=\"snowflake-dask\", \n", - " kind='job', \n", - " filename='snowflake_dask.py',\n", - " image='mlrun/mlrun',\n", - " requirements='requirements.txt',\n", - " handler=\"load_results\", \n", - " description=\"Snowflake Dask - Ingest snowflake data in parallel with Dask cluster\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"xingsheng\"}\n", - " )\n", - "fn.apply(mlrun.platforms.auto_mount())\n", - "fn.deploy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### export function to local `function.yaml` file for testing\n", - "in the real usage, we will import a function from hub" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export('function.yaml')\n", - "# print(fn.to_yaml())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### import a function from local `function.yaml' for testing (Need to change it to import from hub before PR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn = mlrun.import_function(\"./function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fn = mlrun.import_function(\"hub://snowflake_dask\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.apply(mlrun.platforms.auto_mount()) # this is a very important line" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### create a dask cluster and specify the configuration for the dask process (e.g. 
replicas, memory etc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# function URI is db:///\n", - "dask_uri = f'db://{project_name}/{dask_cluster_name}'\n", - "dask_uri" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dsf = mlrun.new_function(name=dask_cluster_name, \n", - " kind='dask', \n", - " image='mlrun/mlrun',\n", - " requirements=[\"bokeh\", \"snowflake-connector-python[pandas]\"]\n", - " )\n", - "dsf.apply(mlrun.mount_v3io())\n", - "dsf.spec.remote = True\n", - "dsf.spec.min_replicas = 1\n", - "dsf.spec.max_replicas = 10\n", - "dsf.spec.service_type = \"NodePort\"\n", - "dsf.with_requests(mem='4G', cpu='2')\n", - "# dsf.spec.node_port=30088\n", - "# dsf.spec.scheduler_timeout = \"5 days\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dsf.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = dsf.client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When running the function you would see a remote dashboard link as part of the result. 
click on this link takes you to the dask monitoring dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "p = 'my-local-test'\n", - "parquet_path = f\"/v3io/bigdata/pq_from_sf_dask/{p}\"\n", - "\n", - "fn.run(handler = 'load_results',\n", - " params={\"dask_client\": dask_uri, \n", - " \"connection_info\": connection_info, \n", - " \"query\": \"SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER\",\n", - " \"parquet_out_dir\": parquet_path,\n", - " \"publish_name\": \"customer\",\n", - " }\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Track the progress in the UI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Users can view the progress and detailed information in the mlrun UI by clicking on the uid above.
\n", - "Also, to track the dask progress in the dask UI click on the \"dashboard link\" above the \"client\" section" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/snowflake_dask/1.1.0/src/snowflake_dask.py b/functions/development/snowflake_dask/1.1.0/src/snowflake_dask.py deleted file mode 100644 index 8846e821..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/snowflake_dask.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -"""Snowflake Dask - Ingest Snaowflake data with Dask""" - -import warnings -import mlrun -from mlrun.execution import MLClientCtx -import snowflake.connector as snow -from dask.distributed import Client -from dask.dataframe import from_delayed -from dask import delayed -from dask import dataframe as dd -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import serialization - -warnings.filterwarnings("ignore") - -@delayed -def load(batch): - - """A delayed load one batch.""" - - try: - print("BATCHING") - df_ = batch.to_pandas() - return df_ - except Exception as e: - print(f"Failed on {batch} for {e}") - raise - -def load_results(context: MLClientCtx, - dask_client: str, - connection_info: str, - query: str, - parquet_out_dir = None, - publish_name = None - ) -> None: - - """Snowflake Dask - Ingest Snowflake data with Dask - - :param context: the function context - :param dask_client: dask cluster function name - :param connection_info: Snowflake database connection info (this will be in a secret later) - :param query: query to for Snowflake - :param parquet_out_dir: directory path for the output parquet files - (default None, not write out) - :param publish_name: name of the dask dataframe to publish to the dask cluster - (default None, not publish) - - """ - context = mlrun.get_or_create_ctx('snawflake-dask-cluster') - sf_password = context.get_secret('sfPassword') - pk_path = context.get_secret('pkPath') - pk_password = context.get_secret('pkPassword') - - if pk_path and pk_password: - with open(pk_path, "rb") as key: - p_key= serialization.load_pem_private_key( - key.read(), - password=str(pk_password).encode(), - backend=default_backend() - ) - pkb = p_key.private_bytes( - encoding=serialization.Encoding.DER, - format=serialization.PrivateFormat.PKCS8 - ,encryption_algorithm=serialization.NoEncryption() - ) - connection_info.pop('password', 'No password found') - connection_info['private_key'] = pkb - elif 
sf_password: - connection_info['password'] = sf_password - else: - raise Exception("\nPlease set up the secret for Snowflake in your project!\n") - - # setup dask client from the MLRun dask cluster function - if dask_client: - client = mlrun.import_function(dask_client).client - context.logger.info(f'Existing dask client === >>> {client}\n') - else: - client = Client() - context.logger.info(f'\nNewly created dask client === >>> {client}\n') - - conn = snow.connect(**connection_info) - cur = conn.cursor() - cur.execute(query) - batches = cur.get_result_batches() - context.logger.info(f'batches len === {len(batches)}\n') - - dfs = [] - for batch in batches: - if batch.rowcount > 0: - df = load(batch) - dfs.append(df) - ddf = from_delayed(dfs) - - # materialize the query results set for some sample compute - - ddf_describe = ddf.describe().compute() - - context.logger.info(f'query === >>> {query}\n') - context.logger.info(f'ddf === >>> {ddf}\n') - context.log_result('number of rows', len(ddf.index)) - context.log_dataset("ddf_describe", df=ddf_describe) - - if publish_name: - context.log_result('data_set_name', publish_name) - if not client.list_datasets(): - ddf.persist(name = publish_name) - client.publish_dataset(publish_name=ddf) - - if parquet_out_dir: - dd.to_parquet(df=ddf, path=parquet_out_dir) - context.log_result('parquet directory', parquet_out_dir) diff --git a/functions/development/snowflake_dask/1.1.0/src/test_snowflake_dask.py b/functions/development/snowflake_dask/1.1.0/src/test_snowflake_dask.py deleted file mode 100644 index fc2d4c93..00000000 --- a/functions/development/snowflake_dask/1.1.0/src/test_snowflake_dask.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Snowflake Dask unit test""" -from mlrun import import_function - -def test_snowflake_dask(): - """An unit test""" - fn_to_test = import_function("function.yaml") - - # a fake assert to pass the unit test - if fn_to_test.to_yaml().__contains__('job'): - assert True diff --git a/functions/development/snowflake_dask/1.1.0/static/documentation.html b/functions/development/snowflake_dask/1.1.0/static/documentation.html deleted file mode 100644 index 68ca0ecd..00000000 --- a/functions/development/snowflake_dask/1.1.0/static/documentation.html +++ /dev/null @@ -1,247 +0,0 @@ - - - - - - - -snowflake_dask package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

snowflake_dask package

- -
- -
-
-
-
-
-

snowflake_dask package#

-
-

Submodules#

-
-
-

snowflake_dask.snowflake_dask module#

-

Snowflake Dask - Ingest Snaowflake data with Dask

-
-
-snowflake_dask.snowflake_dask.load(batch)#
-

A delayed load one batch.

-
-
-
-snowflake_dask.snowflake_dask.load_results(context: mlrun.execution.MLClientCtx, dask_client: str, connection_info: str, query: str, parquet_out_dir=None, publish_name=None)None[source]#
-

Snowflake Dask - Ingest Snowflake data with Dask

-
-
Parameters
-
    -
  • context – the function context

  • -
  • dask_client – dask cluster function name

  • -
  • connection_info – Snowflake database connection info (this will be in a secret later)

  • -
  • query – query to for Snowflake

  • -
  • parquet_out_dir – directory path for the output parquet files -(default None, not write out)

  • -
  • publish_name – name of the dask dataframe to publish to the dask cluster -(default None, not publish)

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/static/example.html b/functions/development/snowflake_dask/1.1.0/static/example.html deleted file mode 100644 index 35fe27a2..00000000 --- a/functions/development/snowflake_dask/1.1.0/static/example.html +++ /dev/null @@ -1,618 +0,0 @@ - - - - - - - -This notebook is to create a function to ingest data from snowflake with a Dask cluster - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

This notebook is to create a function to ingest data from snowflake with a Dask cluster#

-

The dask frameworks enables users to parallelize their python code and run it as a distributed process on Iguazio cluster and dramatically accelerate their performance.
-In this notebook we’ll create an mlrun function running as a dask client to ingest data from snowflake.
-It also demonstrates how to run parallelize query against snowflake using Dask Delayed option to query a large data set from snowflake.
-The function will be published on the function marketplace.
-For more information on dask over kubernetes: https://kubernetes.dask.org/en/latest/

-
-

Set up the enviroment#

-
-
-
import mlrun
-import os
-import warnings
-import yaml
-
-project_name = "snowflake-dask"
-dask_cluster_name="snowflake-dask-cluster"
-artifact_path = mlrun.set_environment(project=project_name,
-                                      artifact_path = os.path.join(os.path.abspath('/v3io/projects/'), project_name))
-
-warnings.filterwarnings("ignore")
-
-print(f'artifact_path = {artifact_path}')
-
-
-
-
-
-
-

Load snowflake configuration from config file.#

-

This is for demo purpose, in the real production code, you would need to put the snowflake connection info into secrets use the secrets in the running pod to connect to snowflake

-
-
-
# Load connection info
-with open(".config.yaml") as f:
-    connection_info = yaml.safe_load(f)
-
-# verify the config
-print(connection_info['account'])
-
-
-
-
-
-
-

Create a python function#

-

This function querys data from snowflake using snowflake python connector for parallel processing of the query results.
-With snoeflake python connector, when you execute a query, the cursor will return the result batches.
-Using Dask Delayed it will return and process results set in parallel.

-
-

write the function to a py file#

-
-
-
%%writefile snowflake_dask.py
-"""Snowflake Dask - Ingest Snowflake data with Dask"""
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives import serialization
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-def load_results(context: MLClientCtx,
-                 dask_client: str,
-                 connection_info: str,
-                 query: str,
-                 parquet_out_dir = None,
-                 publish_name = None
-                ) -> None:
-
-    """Snowflake Dask - Ingest Snowflake data with Dask
-
-    :param context:           the function context
-    :param dask_client:       dask cluster function name
-    :param connection_info:   Snowflake database connection info (this will be in a secret later)
-    :param query:             query to for Snowflake
-    :param parquet_out_dir:   directory path for the output parquet files
-                              (default None, not write out)
-    :param publish_name:      name of the dask dataframe to publish to the dask cluster
-                              (default None, not publish)
-
-    """
-    context = mlrun.get_or_create_ctx('snawflake-dask-cluster')
-    sf_password = context.get_secret('sfPassword')
-    pk_path =  context.get_secret('pkPath')
-    pk_password =  context.get_secret('pkPassword')
-
-    if pk_path and pk_password:
-        with open(pk_path, "rb") as key:
-            p_key= serialization.load_pem_private_key(
-                key.read(),
-                password=str(pk_password).encode(),
-                backend=default_backend()
-            )
-        pkb = p_key.private_bytes(
-            encoding=serialization.Encoding.DER,
-            format=serialization.PrivateFormat.PKCS8
-            ,encryption_algorithm=serialization.NoEncryption()
-        )
-        connection_info.pop('password', 'No password found')
-        connection_info['private_key'] = pkb
-    elif sf_password:
-        connection_info['password'] = sf_password
-    else:
-        raise Exception("\nPlease set up the secret for Snowflake in your project!\n")
-
-    # setup dask client from the MLRun dask cluster function
-    if dask_client:
-        client = mlrun.import_function(dask_client).client
-        context.logger.info(f'Existing dask client === >>> {client}\n')
-    else:
-        client = Client()
-        context.logger.info(f'\nNewly created dask client === >>> {client}\n')
-
-    conn = snow.connect(**connection_info)
-    cur = conn.cursor()
-    cur.execute(query)
-    batches = cur.get_result_batches()
-    context.logger.info(f'batches len === {len(batches)}\n')
-
-    dfs = []
-    for batch in batches:
-        if batch.rowcount > 0:
-            df = load(batch)
-            dfs.append(df)
-    ddf = from_delayed(dfs)
-
-    # materialize the query results set for some sample compute
-
-    ddf_describe = ddf.describe().compute()
-
-    context.logger.info(f'query  === >>> {query}\n')
-    context.logger.info(f'ddf  === >>> {ddf}\n')
-    context.log_result('number of rows', len(ddf.index))
-    context.log_dataset("ddf_describe", df=ddf_describe)
-
-    if publish_name:
-        context.log_result('data_set_name', publish_name)
-        if not client.list_datasets():
-            ddf.persist(name = publish_name)
-            client.publish_dataset(publish_name=ddf)
-
-    if parquet_out_dir:
-        dd.to_parquet(df=ddf, path=parquet_out_dir)
-        context.log_result('parquet directory', parquet_out_dir)
-
-
-
-
-
-
-
-

Convert the code to MLRun function#

-

Use code_to_function to convert the code to MLRun

-
-
-
fn = mlrun.code_to_function(name="snowflake-dask",  
-                            kind='job', 
-                            filename='snowflake_dask.py',
-                            image='mlrun/mlrun',
-                            requirements='requirements.txt',
-                            handler="load_results", 
-                            description="Snowflake Dask - Ingest snowflake data in parallel with Dask cluster",
-                            categories=["data-prep"],
-                            labels={"author": "xingsheng"}
-                           )
-fn.apply(mlrun.platforms.auto_mount())
-fn.deploy()
-
-
-
-
-
-

export function to local function.yaml file for testing#

-

in the real usage, we will import a function from hub

-
-
-
fn.export('function.yaml')
-# print(fn.to_yaml())
-
-
-
-
-
-
-

import a function from local `function.yaml’ for testing (Need to change it to import from hub before PR)#

-
-
-
fn = mlrun.import_function("./function.yaml")
-
-
-
-
-
-
-
# fn = mlrun.import_function("hub://snowflake_dask")
-
-
-
-
-
-
-
fn.apply(mlrun.platforms.auto_mount()) # this is a very important line
-
-
-
-
-
-
-

create a dask cluster and specify the configuration for the dask process (e.g. replicas, memory etc)#

-
-
-
# function URI is db://<project>/<name>
-dask_uri = f'db://{project_name}/{dask_cluster_name}'
-dask_uri
-
-
-
-
-
-
-
dsf = mlrun.new_function(name=dask_cluster_name, 
-                         kind='dask', 
-                         image='mlrun/mlrun',
-                         requirements=["bokeh", "snowflake-connector-python[pandas]"]
-                        )
-dsf.apply(mlrun.mount_v3io())
-dsf.spec.remote = True
-dsf.spec.min_replicas = 1
-dsf.spec.max_replicas = 10
-dsf.spec.service_type = "NodePort"
-dsf.with_requests(mem='4G', cpu='2')
-# dsf.spec.node_port=30088
-# dsf.spec.scheduler_timeout = "5 days"
-
-
-
-
-
-
-
dsf.deploy()
-
-
-
-
-
-
-
client = dsf.client
-
-
-
-
-
-
-
-

Run the function#

-

When running the function you would see a remote dashboard link as part of the result. click on this link takes you to the dask monitoring dashboard

-
-
-
p = 'my-local-test'
-parquet_path = f"/v3io/bigdata/pq_from_sf_dask/{p}"
-
-fn.run(handler = 'load_results',
-       params={"dask_client": dask_uri, 
-               "connection_info": connection_info, 
-               "query": "SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER",
-               "parquet_out_dir": parquet_path,
-               "publish_name": "customer",
-              }
-      )
-
-
-
-
-
-
-
client.close()
-
-
-
-
-
-
-

Track the progress in the UI#

-

Users can view the progress and detailed information in the mlrun UI by clicking on the uid above.
-Also, to track the dask progress in the dask UI click on the “dashboard link” above the “client” section

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/static/function.html b/functions/development/snowflake_dask/1.1.0/static/function.html deleted file mode 100644 index eda48073..00000000 --- a/functions/development/snowflake_dask/1.1.0/static/function.html +++ /dev/null @@ -1,103 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: snowflake-dask
-  tag: ''
-  hash: a002c7743b4a7471c7befe00f5497de050ebe902
-  project: snowflake-dask
-  labels:
-    author: xingsheng
-  categories:
-  - data-prep
-  credentials:
-    access_key: ec09bfc8-1cb4-466d-9049-852081973ce3
-spec:
-  command: ''
-  args: []
-  image: .mlrun/func-snowflake-dask-snowflake-dask:latest
-  build:
-    functionSourceCode: IiIiU25vd2ZsYWtlIERhc2sgLSBJbmdlc3QgU25hb3dmbGFrZSBkYXRhIHdpdGggRGFzayIiIgppbXBvcnQgd2FybmluZ3MKaW1wb3J0IG1scnVuCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eAppbXBvcnQgc25vd2ZsYWtlLmNvbm5lY3RvciBhcyBzbm93CmZyb20gZGFzay5kaXN0cmlidXRlZCBpbXBvcnQgQ2xpZW50CmZyb20gZGFzay5kYXRhZnJhbWUgaW1wb3J0IGZyb21fZGVsYXllZApmcm9tIGRhc2sgaW1wb3J0IGRlbGF5ZWQKZnJvbSBkYXNrIGltcG9ydCBkYXRhZnJhbWUgYXMgZGQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LmJhY2tlbmRzIGltcG9ydCBkZWZhdWx0X2JhY2tlbmQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LnByaW1pdGl2ZXMgaW1wb3J0IHNlcmlhbGl6YXRpb24KCndhcm5pbmdzLmZpbHRlcndhcm5pbmdzKCJpZ25vcmUiKQoKQGRlbGF5ZWQKZGVmIGxvYWQoYmF0Y2gpOgoKICAgICIiIkEgZGVsYXllZCBsb2FkIG9uZSBiYXRjaC4iIiIKCiAgICB0cnk6CiAgICAgICAgcHJpbnQoIkJBVENISU5HIikKICAgICAgICBkZl8gPSBiYXRjaC50b19wYW5kYXMoKQogICAgICAgIHJldHVybiBkZl8KICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICBwcmludChmIkZhaWxlZCBvbiB7YmF0Y2h9IGZvciB7ZX0iKQogICAgICAgIHJhaXNlCgpkZWYgbG9hZF9yZXN1bHRzKGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgICAgICAgICAgICAgIGRhc2tfY2xpZW50OiBzdHIsCiAgICAgICAgICAgICAgICAgY29ubmVjdGlvbl9pbmZvOiBzdHIsCiAgICAgICAgICAgICAgICAgcXVlcnk6IHN0ciwKICAgICAgICAgICAgICAgICBwYXJxdWV0X291dF9kaXIgPSBOb25lLAogICAgICAgICAgICAgICAgIHB1Ymxpc2hfbmFtZSA9IE5vbmUKICAgICAgICAgICAgICAgICkgLT4gTm9uZToKCiAgICAiIiJTbm93Zmxha2UgRGFzayAtIEluZ2VzdCBTbmFvd2ZsYWtlIGRhdGEgd2l0aCBEYXNrCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhc2tfY2xpZW50OiAgICAgICBkYXNrIGNsdXN0ZXIgZnVuY3Rpb24gbmFtZQogICAgOnBhcmFtIGNvbm5lY3Rpb25faW5mbzogICBTbm93Zmxha2UgZGF0YWJhc2UgY29ubmVjdGlvbiBpbmZvICh0aGlzIHdpbGwgYmUgaW4gYSBzZWNyZXQgbGF0ZXIpCiAgICA6cGFyYW0gcXVlcnk6ICAgICAgICAgICAgIHF1ZXJ5IHRvIGZvciBTbm93Zmxha2UKICAgIDpwYXJhbSBwYXJxdWV0X291dF9kaXI6ICAgZGlyZWN0b3J5IHBhdGggZm9yIHRoZSBvdXRwdXQgcGFycXVldCBmaWxlcwogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAoZGVmYXVsdCBOb25lLCBub3Qgd3JpdGUgb3V0KQogICAgOnBhcmFtIHB1Ymxpc2hfbmFtZTogICAgICBuYW1lIG9mIHRoZSBkYXNrIGRhdGFmcmFtZSB0byBwdWJsaXNoIHRvIHRoZSBkYXNrIGNsdXN0ZXIKICAgICAgICAgICAgICA
gICAgICAgICAgICAgICAgKGRlZmF1bHQgTm9uZSwgbm90IHB1Ymxpc2gpCgogICAgIiIiCiAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgoJ3NuYXdmbGFrZS1kYXNrLWNsdXN0ZXInKQogICAgc2ZfcGFzc3dvcmQgPSBjb250ZXh0LmdldF9zZWNyZXQoJ3NmUGFzc3dvcmQnKQogICAgcGtfcGF0aCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGF0aCcpCiAgICBwa19wYXNzd29yZCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGFzc3dvcmQnKQoKICAgIGlmIHBrX3BhdGggYW5kIHBrX3Bhc3N3b3JkOgogICAgICAgIHdpdGggb3Blbihwa19wYXRoLCAicmIiKSBhcyBrZXk6CiAgICAgICAgICAgIHBfa2V5PSBzZXJpYWxpemF0aW9uLmxvYWRfcGVtX3ByaXZhdGVfa2V5KAogICAgICAgICAgICAgICAga2V5LnJlYWQoKSwKICAgICAgICAgICAgICAgIHBhc3N3b3JkPXN0cihwa19wYXNzd29yZCkuZW5jb2RlKCksCiAgICAgICAgICAgICAgICBiYWNrZW5kPWRlZmF1bHRfYmFja2VuZCgpCiAgICAgICAgICAgICkKICAgICAgICBwa2IgPSBwX2tleS5wcml2YXRlX2J5dGVzKAogICAgICAgICAgICBlbmNvZGluZz1zZXJpYWxpemF0aW9uLkVuY29kaW5nLkRFUiwKICAgICAgICAgICAgZm9ybWF0PXNlcmlhbGl6YXRpb24uUHJpdmF0ZUZvcm1hdC5QS0NTOAogICAgICAgICAgICAsZW5jcnlwdGlvbl9hbGdvcml0aG09c2VyaWFsaXphdGlvbi5Ob0VuY3J5cHRpb24oKQogICAgICAgICkKICAgICAgICBjb25uZWN0aW9uX2luZm8ucG9wKCdwYXNzd29yZCcsICdObyBwYXNzd29yZCBmb3VuZCcpCiAgICAgICAgY29ubmVjdGlvbl9pbmZvWydwcml2YXRlX2tleSddID0gcGtiCiAgICBlbGlmIHNmX3Bhc3N3b3JkOgogICAgICAgIGNvbm5lY3Rpb25faW5mb1sncGFzc3dvcmQnXSA9IHNmX3Bhc3N3b3JkCiAgICBlbHNlOgogICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiXG5QbGVhc2Ugc2V0IHVwIHRoZSBzZWNyZXQgZm9yIFNub3dmbGFrZSBpbiB5b3VyIHByb2plY3QhXG4iKQoKICAgICMgc2V0dXAgZGFzayBjbGllbnQgZnJvbSB0aGUgTUxSdW4gZGFzayBjbHVzdGVyIGZ1bmN0aW9uCiAgICBpZiBkYXNrX2NsaWVudDoKICAgICAgICBjbGllbnQgPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oZGFza19jbGllbnQpLmNsaWVudAogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidFeGlzdGluZyBkYXNrIGNsaWVudCA9PT0gPj4+IHtjbGllbnR9XG4nKQogICAgZWxzZToKICAgICAgICBjbGllbnQgPSBDbGllbnQoKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidcbk5ld2x5IGNyZWF0ZWQgZGFzayBjbGllbnQgPT09ID4+PiB7Y2xpZW50fVxuJykKCiAgICBjb25uID0gc25vdy5jb25uZWN0KCoqY29ubmVjdGlvbl9pbmZvKQogICAgY3VyID0gY29ubi5jdXJzb3IoKQogICAgY3VyLmV4ZWN1dGUocXVlcnkpCiAgICBiYXRjaGVzID0gY3VyLmdldF9yZXN1bHRfYmF0Y2hlcygpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnYmF
0Y2hlcyBsZW4gPT09IHtsZW4oYmF0Y2hlcyl9XG4nKQoKICAgIGRmcyA9IFtdCiAgICBmb3IgYmF0Y2ggaW4gYmF0Y2hlczoKICAgICAgICBpZiBiYXRjaC5yb3djb3VudCA+IDA6CiAgICAgICAgICAgIGRmID0gbG9hZChiYXRjaCkKICAgICAgICAgICAgZGZzLmFwcGVuZChkZikKICAgIGRkZiA9IGZyb21fZGVsYXllZChkZnMpCgogICAgIyBtYXRlcmlhbGl6ZSB0aGUgcXVlcnkgcmVzdWx0cyBzZXQgZm9yIHNvbWUgc2FtcGxlIGNvbXB1dGUKCiAgICBkZGZfZGVzY3JpYmUgPSBkZGYuZGVzY3JpYmUoKS5jb21wdXRlKCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncXVlcnkgID09PSA+Pj4ge3F1ZXJ5fVxuJykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidkZGYgID09PSA+Pj4ge2RkZn1cbicpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ251bWJlciBvZiByb3dzJywgbGVuKGRkZi5pbmRleCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJkZGZfZGVzY3JpYmUiLCBkZj1kZGZfZGVzY3JpYmUpCgogICAgaWYgcHVibGlzaF9uYW1lOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnZGF0YV9zZXRfbmFtZScsIHB1Ymxpc2hfbmFtZSkKICAgICAgICBpZiBub3QgY2xpZW50Lmxpc3RfZGF0YXNldHMoKToKICAgICAgICAgICAgZGRmLnBlcnNpc3QobmFtZSA9IHB1Ymxpc2hfbmFtZSkKICAgICAgICAgICAgY2xpZW50LnB1Ymxpc2hfZGF0YXNldChwdWJsaXNoX25hbWU9ZGRmKQoKICAgIGlmIHBhcnF1ZXRfb3V0X2RpcjoKICAgICAgICBkZC50b19wYXJxdWV0KGRmPWRkZiwgcGF0aD1wYXJxdWV0X291dF9kaXIpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdwYXJxdWV0IGRpcmVjdG9yeScsIHBhcnF1ZXRfb3V0X2RpcikK
-    base_image: mlrun/mlrun
-    commands:
-    - python -m pip install bokeh snowflake-connector-python[pandas] mlrun~=0.9.1
-    code_origin: https://github.com/xsqian/functions.git#6b31040e2ad762602f335b0589823a1c61a09975:snowflake_dask.py
-    origin_filename: snowflake_dask.py
-  entry_points:
-    load:
-      name: load
-      doc: A delayed load one batch.
-      parameters:
-      - name: batch
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 15
-    load_results:
-      name: load_results
-      doc: Snowflake Dask - Ingest Snaowflake data with Dask
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: dask_client
-        type: str
-        doc: dask cluster function name
-        default: ''
-      - name: connection_info
-        type: str
-        doc: Snowflake database connection info (this will be in a secret later)
-        default: ''
-      - name: query
-        type: str
-        doc: query to for Snowflake
-        default: ''
-      - name: parquet_out_dir
-        doc: directory path for the output parquet files (default None, not write
-          out)
-        default: null
-      - name: publish_name
-        doc: name of the dask dataframe to publish to the dask cluster (default None,
-          not publish)
-        default: null
-      outputs:
-      - default: ''
-      lineno: 28
-  description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster
-  default_handler: load_results
-  disable_auto_mount: false
-  env:
-  - name: V3IO_API
-    value: ''
-  - name: V3IO_USERNAME
-    value: ''
-  - name: V3IO_ACCESS_KEY
-    value: ''
-  - name: V3IO_FRAMESD
-    value: ''
-  priority_class_name: igz-workload-medium
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/static/item.html b/functions/development/snowflake_dask/1.1.0/static/item.html deleted file mode 100644 index c7553d0c..00000000 --- a/functions/development/snowflake_dask/1.1.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster
-doc: ''
-example: snowflake-dask-mlrun.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: xingsheng
-  framework: dask
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.1
-name: snowflake_dask
-platformVersion: 3.5.0
-spec:
-  filename: snowflake_dask.py
-  handler: load_results
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/static/snowflake_dask.html b/functions/development/snowflake_dask/1.1.0/static/snowflake_dask.html deleted file mode 100644 index 1acc2302..00000000 --- a/functions/development/snowflake_dask/1.1.0/static/snowflake_dask.html +++ /dev/null @@ -1,265 +0,0 @@ - - - - - - - -snowflake_dask.snowflake_dask - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for snowflake_dask.snowflake_dask

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Snowflake Dask - Ingest Snaowflake data with Dask"""
-
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives import serialization
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-
[docs]def load_results(context: MLClientCtx, - dask_client: str, - connection_info: str, - query: str, - parquet_out_dir = None, - publish_name = None - ) -> None: - - """Snowflake Dask - Ingest Snowflake data with Dask - - :param context: the function context - :param dask_client: dask cluster function name - :param connection_info: Snowflake database connection info (this will be in a secret later) - :param query: query to for Snowflake - :param parquet_out_dir: directory path for the output parquet files - (default None, not write out) - :param publish_name: name of the dask dataframe to publish to the dask cluster - (default None, not publish) - - """ - context = mlrun.get_or_create_ctx('snawflake-dask-cluster') - sf_password = context.get_secret('sfPassword') - pk_path = context.get_secret('pkPath') - pk_password = context.get_secret('pkPassword') - - if pk_path and pk_password: - with open(pk_path, "rb") as key: - p_key= serialization.load_pem_private_key( - key.read(), - password=str(pk_password).encode(), - backend=default_backend() - ) - pkb = p_key.private_bytes( - encoding=serialization.Encoding.DER, - format=serialization.PrivateFormat.PKCS8 - ,encryption_algorithm=serialization.NoEncryption() - ) - connection_info.pop('password', 'No password found') - connection_info['private_key'] = pkb - elif sf_password: - connection_info['password'] = sf_password - else: - raise Exception("\nPlease set up the secret for Snowflake in your project!\n") - - # setup dask client from the MLRun dask cluster function - if dask_client: - client = mlrun.import_function(dask_client).client - context.logger.info(f'Existing dask client === >>> {client}\n') - else: - client = Client() - context.logger.info(f'\nNewly created dask client === >>> {client}\n') - - conn = snow.connect(**connection_info) - cur = conn.cursor() - cur.execute(query) - batches = cur.get_result_batches() - context.logger.info(f'batches len === {len(batches)}\n') - - dfs = [] - for batch in batches: - if 
batch.rowcount > 0: - df = load(batch) - dfs.append(df) - ddf = from_delayed(dfs) - - # materialize the query results set for some sample compute - - ddf_describe = ddf.describe().compute() - - context.logger.info(f'query === >>> {query}\n') - context.logger.info(f'ddf === >>> {ddf}\n') - context.log_result('number of rows', len(ddf.index)) - context.log_dataset("ddf_describe", df=ddf_describe) - - if publish_name: - context.log_result('data_set_name', publish_name) - if not client.list_datasets(): - ddf.persist(name = publish_name) - client.publish_dataset(publish_name=ddf) - - if parquet_out_dir: - dd.to_parquet(df=ddf, path=parquet_out_dir) - context.log_result('parquet directory', parquet_out_dir)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/1.1.0/static/source.html b/functions/development/snowflake_dask/1.1.0/static/source.html deleted file mode 100644 index 99a7e972..00000000 --- a/functions/development/snowflake_dask/1.1.0/static/source.html +++ /dev/null @@ -1,147 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Snowflake Dask - Ingest Snaowflake data with Dask"""
-
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives import serialization
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-def load_results(context: MLClientCtx,
-                 dask_client: str,
-                 connection_info: str,
-                 query: str,
-                 parquet_out_dir = None,
-                 publish_name = None
-                ) -> None:
-
-    """Snowflake Dask - Ingest Snowflake data with Dask
-
-    :param context:           the function context
-    :param dask_client:       dask cluster function name
-    :param connection_info:   Snowflake database connection info (this will be in a secret later)
-    :param query:             query to for Snowflake
-    :param parquet_out_dir:   directory path for the output parquet files
-                              (default None, not write out)
-    :param publish_name:      name of the dask dataframe to publish to the dask cluster
-                              (default None, not publish)
-
-    """
-    context = mlrun.get_or_create_ctx('snawflake-dask-cluster')
-    sf_password = context.get_secret('sfPassword')
-    pk_path =  context.get_secret('pkPath')
-    pk_password =  context.get_secret('pkPassword')
-
-    if pk_path and pk_password:
-        with open(pk_path, "rb") as key:
-            p_key= serialization.load_pem_private_key(
-                key.read(),
-                password=str(pk_password).encode(),
-                backend=default_backend()
-            )
-        pkb = p_key.private_bytes(
-            encoding=serialization.Encoding.DER,
-            format=serialization.PrivateFormat.PKCS8
-            ,encryption_algorithm=serialization.NoEncryption()
-        )
-        connection_info.pop('password', 'No password found')
-        connection_info['private_key'] = pkb
-    elif sf_password:
-        connection_info['password'] = sf_password
-    else:
-        raise Exception("\nPlease set up the secret for Snowflake in your project!\n")
-
-    # setup dask client from the MLRun dask cluster function
-    if dask_client:
-        client = mlrun.import_function(dask_client).client
-        context.logger.info(f'Existing dask client === >>> {client}\n')
-    else:
-        client = Client()
-        context.logger.info(f'\nNewly created dask client === >>> {client}\n')
-
-    conn = snow.connect(**connection_info)
-    cur = conn.cursor()
-    cur.execute(query)
-    batches = cur.get_result_batches()
-    context.logger.info(f'batches len === {len(batches)}\n')
-
-    dfs = []
-    for batch in batches:
-        if batch.rowcount > 0:
-            df = load(batch)
-            dfs.append(df)
-    ddf = from_delayed(dfs)
-
-    # materialize the query results set for some sample compute
-
-    ddf_describe = ddf.describe().compute()
-
-    context.logger.info(f'query  === >>> {query}\n')
-    context.logger.info(f'ddf  === >>> {ddf}\n')
-    context.log_result('number of rows', len(ddf.index))
-    context.log_dataset("ddf_describe", df=ddf_describe)
-
-    if publish_name:
-        context.log_result('data_set_name', publish_name)
-        if not client.list_datasets():
-            ddf.persist(name = publish_name)
-            client.publish_dataset(publish_name=ddf)
-
-    if parquet_out_dir:
-        dd.to_parquet(df=ddf, path=parquet_out_dir)
-        context.log_result('parquet directory', parquet_out_dir)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/src/README.md b/functions/development/snowflake_dask/latest/src/README.md deleted file mode 100644 index 70fa3c92..00000000 --- a/functions/development/snowflake_dask/latest/src/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# **Data Preperation Function** - -## `Snowflake_dask` - -![](img/snowflake-dask.png) - -This function query the data from a snowflake database and process the results -in parallel in a Dask cluster. -It will publish the dask dataframe in the cluster for other process to use. -It can also write the results dataframe to parquet files. - -```markdown - -:param context: the function context -:param dask_client: dask cluster function name -:param connection_info: Snowflake database connection info (this will be in a secret later) -:param query: query to for Snowflake -:param parquet_out_dir: directory path for the output parquet files (default None, not write out) -:param publish_name: name of the dask dataframe to publish to the dask cluster (default None, not publish) -``` - -To use the function, you will need to either have the password or key pair authentication to Snowflake configured. - -To get the password, or generate key pair in Snowflake and configure Snowflake for key pair authentication, please follow Snowflake [documentation](https://docs.snowflake.com/en/user-guide/key-pair-auth.html) here. - -After obtained password or key pair, please set up the project secrets in your Iguazio cluster. - -If you are using password, you only need to add ```sfPassword``` secret to the project settings. - -If you are using the key pair authentication, you will need to add both ```pkPath``` and ```pkPassword``` to the project settings. - - where: - - ```pkPath``` is the file path to your private key file in the cluster, for example ```/User/rsa_key.p8``` - -```pkPassword``` is your private key encryption password. Please see the screenshot below for your reference. 
- -![Secrets Screenshot](img/iguazio-project-secrets.png) diff --git a/functions/development/snowflake_dask/latest/src/config-template.yaml b/functions/development/snowflake_dask/latest/src/config-template.yaml deleted file mode 100644 index fb46ac2e..00000000 --- a/functions/development/snowflake_dask/latest/src/config-template.yaml +++ /dev/null @@ -1,5 +0,0 @@ -user: "..." -password: "..." -warehouse: "..." -account: "..." -application: "Iguazio" \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/src/function.yaml b/functions/development/snowflake_dask/latest/src/function.yaml deleted file mode 100644 index c9cc8d74..00000000 --- a/functions/development/snowflake_dask/latest/src/function.yaml +++ /dev/null @@ -1,81 +0,0 @@ -kind: job -metadata: - name: snowflake-dask - tag: '' - hash: a002c7743b4a7471c7befe00f5497de050ebe902 - project: snowflake-dask - labels: - author: xingsheng - categories: - - data-prep - credentials: - access_key: ec09bfc8-1cb4-466d-9049-852081973ce3 -spec: - command: '' - args: [] - image: .mlrun/func-snowflake-dask-snowflake-dask:latest - build: - functionSourceCode: 
IiIiU25vd2ZsYWtlIERhc2sgLSBJbmdlc3QgU25hb3dmbGFrZSBkYXRhIHdpdGggRGFzayIiIgppbXBvcnQgd2FybmluZ3MKaW1wb3J0IG1scnVuCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eAppbXBvcnQgc25vd2ZsYWtlLmNvbm5lY3RvciBhcyBzbm93CmZyb20gZGFzay5kaXN0cmlidXRlZCBpbXBvcnQgQ2xpZW50CmZyb20gZGFzay5kYXRhZnJhbWUgaW1wb3J0IGZyb21fZGVsYXllZApmcm9tIGRhc2sgaW1wb3J0IGRlbGF5ZWQKZnJvbSBkYXNrIGltcG9ydCBkYXRhZnJhbWUgYXMgZGQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LmJhY2tlbmRzIGltcG9ydCBkZWZhdWx0X2JhY2tlbmQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LnByaW1pdGl2ZXMgaW1wb3J0IHNlcmlhbGl6YXRpb24KCndhcm5pbmdzLmZpbHRlcndhcm5pbmdzKCJpZ25vcmUiKQoKQGRlbGF5ZWQKZGVmIGxvYWQoYmF0Y2gpOgoKICAgICIiIkEgZGVsYXllZCBsb2FkIG9uZSBiYXRjaC4iIiIKCiAgICB0cnk6CiAgICAgICAgcHJpbnQoIkJBVENISU5HIikKICAgICAgICBkZl8gPSBiYXRjaC50b19wYW5kYXMoKQogICAgICAgIHJldHVybiBkZl8KICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICBwcmludChmIkZhaWxlZCBvbiB7YmF0Y2h9IGZvciB7ZX0iKQogICAgICAgIHJhaXNlCgpkZWYgbG9hZF9yZXN1bHRzKGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgICAgICAgICAgICAgIGRhc2tfY2xpZW50OiBzdHIsCiAgICAgICAgICAgICAgICAgY29ubmVjdGlvbl9pbmZvOiBzdHIsCiAgICAgICAgICAgICAgICAgcXVlcnk6IHN0ciwKICAgICAgICAgICAgICAgICBwYXJxdWV0X291dF9kaXIgPSBOb25lLAogICAgICAgICAgICAgICAgIHB1Ymxpc2hfbmFtZSA9IE5vbmUKICAgICAgICAgICAgICAgICkgLT4gTm9uZToKCiAgICAiIiJTbm93Zmxha2UgRGFzayAtIEluZ2VzdCBTbmFvd2ZsYWtlIGRhdGEgd2l0aCBEYXNrCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhc2tfY2xpZW50OiAgICAgICBkYXNrIGNsdXN0ZXIgZnVuY3Rpb24gbmFtZQogICAgOnBhcmFtIGNvbm5lY3Rpb25faW5mbzogICBTbm93Zmxha2UgZGF0YWJhc2UgY29ubmVjdGlvbiBpbmZvICh0aGlzIHdpbGwgYmUgaW4gYSBzZWNyZXQgbGF0ZXIpCiAgICA6cGFyYW0gcXVlcnk6ICAgICAgICAgICAgIHF1ZXJ5IHRvIGZvciBTbm93Zmxha2UKICAgIDpwYXJhbSBwYXJxdWV0X291dF9kaXI6ICAgZGlyZWN0b3J5IHBhdGggZm9yIHRoZSBvdXRwdXQgcGFycXVldCBmaWxlcwogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAoZGVmYXVsdCBOb25lLCBub3Qgd3JpdGUgb3V0KQogICAgOnBhcmFtIHB1Ymxpc2hfbmFtZTogICAgICBuYW1lIG9mIHRoZSBkYXNrIGRhdGFmcmFtZSB0byBwdWJsaXNoIHRvIHRoZSBkYXNrIGNsdXN0ZXIKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgKGRl
ZmF1bHQgTm9uZSwgbm90IHB1Ymxpc2gpCgogICAgIiIiCiAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgoJ3NuYXdmbGFrZS1kYXNrLWNsdXN0ZXInKQogICAgc2ZfcGFzc3dvcmQgPSBjb250ZXh0LmdldF9zZWNyZXQoJ3NmUGFzc3dvcmQnKQogICAgcGtfcGF0aCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGF0aCcpCiAgICBwa19wYXNzd29yZCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGFzc3dvcmQnKQoKICAgIGlmIHBrX3BhdGggYW5kIHBrX3Bhc3N3b3JkOgogICAgICAgIHdpdGggb3Blbihwa19wYXRoLCAicmIiKSBhcyBrZXk6CiAgICAgICAgICAgIHBfa2V5PSBzZXJpYWxpemF0aW9uLmxvYWRfcGVtX3ByaXZhdGVfa2V5KAogICAgICAgICAgICAgICAga2V5LnJlYWQoKSwKICAgICAgICAgICAgICAgIHBhc3N3b3JkPXN0cihwa19wYXNzd29yZCkuZW5jb2RlKCksCiAgICAgICAgICAgICAgICBiYWNrZW5kPWRlZmF1bHRfYmFja2VuZCgpCiAgICAgICAgICAgICkKICAgICAgICBwa2IgPSBwX2tleS5wcml2YXRlX2J5dGVzKAogICAgICAgICAgICBlbmNvZGluZz1zZXJpYWxpemF0aW9uLkVuY29kaW5nLkRFUiwKICAgICAgICAgICAgZm9ybWF0PXNlcmlhbGl6YXRpb24uUHJpdmF0ZUZvcm1hdC5QS0NTOAogICAgICAgICAgICAsZW5jcnlwdGlvbl9hbGdvcml0aG09c2VyaWFsaXphdGlvbi5Ob0VuY3J5cHRpb24oKQogICAgICAgICkKICAgICAgICBjb25uZWN0aW9uX2luZm8ucG9wKCdwYXNzd29yZCcsICdObyBwYXNzd29yZCBmb3VuZCcpCiAgICAgICAgY29ubmVjdGlvbl9pbmZvWydwcml2YXRlX2tleSddID0gcGtiCiAgICBlbGlmIHNmX3Bhc3N3b3JkOgogICAgICAgIGNvbm5lY3Rpb25faW5mb1sncGFzc3dvcmQnXSA9IHNmX3Bhc3N3b3JkCiAgICBlbHNlOgogICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiXG5QbGVhc2Ugc2V0IHVwIHRoZSBzZWNyZXQgZm9yIFNub3dmbGFrZSBpbiB5b3VyIHByb2plY3QhXG4iKQoKICAgICMgc2V0dXAgZGFzayBjbGllbnQgZnJvbSB0aGUgTUxSdW4gZGFzayBjbHVzdGVyIGZ1bmN0aW9uCiAgICBpZiBkYXNrX2NsaWVudDoKICAgICAgICBjbGllbnQgPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oZGFza19jbGllbnQpLmNsaWVudAogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidFeGlzdGluZyBkYXNrIGNsaWVudCA9PT0gPj4+IHtjbGllbnR9XG4nKQogICAgZWxzZToKICAgICAgICBjbGllbnQgPSBDbGllbnQoKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidcbk5ld2x5IGNyZWF0ZWQgZGFzayBjbGllbnQgPT09ID4+PiB7Y2xpZW50fVxuJykKCiAgICBjb25uID0gc25vdy5jb25uZWN0KCoqY29ubmVjdGlvbl9pbmZvKQogICAgY3VyID0gY29ubi5jdXJzb3IoKQogICAgY3VyLmV4ZWN1dGUocXVlcnkpCiAgICBiYXRjaGVzID0gY3VyLmdldF9yZXN1bHRfYmF0Y2hlcygpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnYmF0Y2hlcyBsZW4gPT09IHtsZW4o
YmF0Y2hlcyl9XG4nKQoKICAgIGRmcyA9IFtdCiAgICBmb3IgYmF0Y2ggaW4gYmF0Y2hlczoKICAgICAgICBpZiBiYXRjaC5yb3djb3VudCA+IDA6CiAgICAgICAgICAgIGRmID0gbG9hZChiYXRjaCkKICAgICAgICAgICAgZGZzLmFwcGVuZChkZikKICAgIGRkZiA9IGZyb21fZGVsYXllZChkZnMpCgogICAgIyBtYXRlcmlhbGl6ZSB0aGUgcXVlcnkgcmVzdWx0cyBzZXQgZm9yIHNvbWUgc2FtcGxlIGNvbXB1dGUKCiAgICBkZGZfZGVzY3JpYmUgPSBkZGYuZGVzY3JpYmUoKS5jb21wdXRlKCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncXVlcnkgID09PSA+Pj4ge3F1ZXJ5fVxuJykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidkZGYgID09PSA+Pj4ge2RkZn1cbicpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ251bWJlciBvZiByb3dzJywgbGVuKGRkZi5pbmRleCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJkZGZfZGVzY3JpYmUiLCBkZj1kZGZfZGVzY3JpYmUpCgogICAgaWYgcHVibGlzaF9uYW1lOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnZGF0YV9zZXRfbmFtZScsIHB1Ymxpc2hfbmFtZSkKICAgICAgICBpZiBub3QgY2xpZW50Lmxpc3RfZGF0YXNldHMoKToKICAgICAgICAgICAgZGRmLnBlcnNpc3QobmFtZSA9IHB1Ymxpc2hfbmFtZSkKICAgICAgICAgICAgY2xpZW50LnB1Ymxpc2hfZGF0YXNldChwdWJsaXNoX25hbWU9ZGRmKQoKICAgIGlmIHBhcnF1ZXRfb3V0X2RpcjoKICAgICAgICBkZC50b19wYXJxdWV0KGRmPWRkZiwgcGF0aD1wYXJxdWV0X291dF9kaXIpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdwYXJxdWV0IGRpcmVjdG9yeScsIHBhcnF1ZXRfb3V0X2RpcikK - base_image: mlrun/mlrun - commands: - - python -m pip install bokeh snowflake-connector-python[pandas] mlrun~=0.9.1 - code_origin: https://github.com/xsqian/functions.git#6b31040e2ad762602f335b0589823a1c61a09975:snowflake_dask.py - origin_filename: snowflake_dask.py - entry_points: - load: - name: load - doc: A delayed load one batch. 
- parameters: - - name: batch - default: '' - outputs: - - default: '' - lineno: 15 - load_results: - name: load_results - doc: Snowflake Dask - Ingest Snaowflake data with Dask - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: dask_client - type: str - doc: dask cluster function name - default: '' - - name: connection_info - type: str - doc: Snowflake database connection info (this will be in a secret later) - default: '' - - name: query - type: str - doc: query to for Snowflake - default: '' - - name: parquet_out_dir - doc: directory path for the output parquet files (default None, not write - out) - default: null - - name: publish_name - doc: name of the dask dataframe to publish to the dask cluster (default None, - not publish) - default: null - outputs: - - default: '' - lineno: 28 - description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster - default_handler: load_results - disable_auto_mount: false - env: - - name: V3IO_API - value: '' - - name: V3IO_USERNAME - value: '' - - name: V3IO_ACCESS_KEY - value: '' - - name: V3IO_FRAMESD - value: '' - priority_class_name: igz-workload-medium - preemption_mode: prevent - affinity: null - tolerations: null -verbose: false diff --git a/functions/development/snowflake_dask/latest/src/img/iguazio-project-secrets.png b/functions/development/snowflake_dask/latest/src/img/iguazio-project-secrets.png deleted file mode 100644 index 29f48aa3..00000000 Binary files a/functions/development/snowflake_dask/latest/src/img/iguazio-project-secrets.png and /dev/null differ diff --git a/functions/development/snowflake_dask/latest/src/img/snowflake-dask.png b/functions/development/snowflake_dask/latest/src/img/snowflake-dask.png deleted file mode 100644 index 30a25282..00000000 Binary files a/functions/development/snowflake_dask/latest/src/img/snowflake-dask.png and /dev/null differ diff --git a/functions/development/snowflake_dask/latest/src/item.yaml 
b/functions/development/snowflake_dask/latest/src/item.yaml deleted file mode 100644 index c12d3aba..00000000 --- a/functions/development/snowflake_dask/latest/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster -doc: '' -example: snowflake-dask-mlrun.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: xingsheng - framework: dask -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.1 -name: snowflake_dask -platformVersion: 3.5.0 -spec: - filename: snowflake_dask.py - handler: load_results - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/snowflake_dask/latest/src/requirements.txt b/functions/development/snowflake_dask/latest/src/requirements.txt deleted file mode 100644 index 0bca2c92..00000000 --- a/functions/development/snowflake_dask/latest/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -bokeh -snowflake-connector-python[pandas] diff --git a/functions/development/snowflake_dask/latest/src/snowflake-dask-mlrun.ipynb b/functions/development/snowflake_dask/latest/src/snowflake-dask-mlrun.ipynb deleted file mode 100644 index 03936f2a..00000000 --- a/functions/development/snowflake_dask/latest/src/snowflake-dask-mlrun.ipynb +++ /dev/null @@ -1,437 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# This notebook is to create a function to ingest data from snowflake with a Dask cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dask frameworks enables users to parallelize their python code and run it as a distributed process on Iguazio cluster and dramatically accelerate their performance.
\n", - "In this notebook we'll create an mlrun function running as a dask client to ingest data from snowflake.
\n", - "It also demonstrates how to run parallelize query against snowflake using Dask Delayed option to query a large data set from snowflake.
\n", - "The function will be published on the function marketplace.
\n", - "For more information on dask over kubernetes: https://kubernetes.dask.org/en/latest/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up the enviroment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n", - "import os\n", - "import warnings\n", - "import yaml\n", - "\n", - "project_name = \"snowflake-dask\"\n", - "dask_cluster_name=\"snowflake-dask-cluster\"\n", - "artifact_path = mlrun.set_environment(project=project_name,\n", - " artifact_path = os.path.join(os.path.abspath('/v3io/projects/'), project_name))\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "print(f'artifact_path = {artifact_path}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load snowflake configuration from config file. \n", - "This is for demo purpose, in the real production code, you would need to put the snowflake connection info into secrets use the secrets in the running pod to connect to snowflake" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load connection info\n", - "with open(\".config.yaml\") as f:\n", - " connection_info = yaml.safe_load(f)\n", - "\n", - "# verify the config\n", - "print(connection_info['account'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a python function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function querys data from snowflake using snowflake python connector for parallel processing of the query results.
\n", - "With snoeflake python connector, when you execute a query, the cursor will return the result batches.
\n", - "Using Dask Delayed it will return and process results set in parallel.
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### write the function to a py file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile snowflake_dask.py\n", - "\"\"\"Snowflake Dask - Ingest Snowflake data with Dask\"\"\"\n", - "import warnings\n", - "import mlrun\n", - "from mlrun.execution import MLClientCtx\n", - "import snowflake.connector as snow\n", - "from dask.distributed import Client\n", - "from dask.dataframe import from_delayed\n", - "from dask import delayed\n", - "from dask import dataframe as dd\n", - "from cryptography.hazmat.backends import default_backend\n", - "from cryptography.hazmat.primitives import serialization\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "@delayed\n", - "def load(batch):\n", - "\n", - " \"\"\"A delayed load one batch.\"\"\"\n", - "\n", - " try:\n", - " print(\"BATCHING\")\n", - " df_ = batch.to_pandas()\n", - " return df_\n", - " except Exception as e:\n", - " print(f\"Failed on {batch} for {e}\")\n", - " raise\n", - "\n", - "def load_results(context: MLClientCtx,\n", - " dask_client: str,\n", - " connection_info: str,\n", - " query: str,\n", - " parquet_out_dir = None,\n", - " publish_name = None\n", - " ) -> None:\n", - "\n", - " \"\"\"Snowflake Dask - Ingest Snowflake data with Dask\n", - "\n", - " :param context: the function context\n", - " :param dask_client: dask cluster function name\n", - " :param connection_info: Snowflake database connection info (this will be in a secret later)\n", - " :param query: query to for Snowflake\n", - " :param parquet_out_dir: directory path for the output parquet files\n", - " (default None, not write out)\n", - " :param publish_name: name of the dask dataframe to publish to the dask cluster\n", - " (default None, not publish)\n", - "\n", - " \"\"\"\n", - " context = mlrun.get_or_create_ctx('snawflake-dask-cluster')\n", - " sf_password = 
context.get_secret('sfPassword')\n", - " pk_path = context.get_secret('pkPath')\n", - " pk_password = context.get_secret('pkPassword')\n", - "\n", - " if pk_path and pk_password:\n", - " with open(pk_path, \"rb\") as key:\n", - " p_key= serialization.load_pem_private_key(\n", - " key.read(),\n", - " password=str(pk_password).encode(),\n", - " backend=default_backend()\n", - " )\n", - " pkb = p_key.private_bytes(\n", - " encoding=serialization.Encoding.DER,\n", - " format=serialization.PrivateFormat.PKCS8\n", - " ,encryption_algorithm=serialization.NoEncryption()\n", - " )\n", - " connection_info.pop('password', 'No password found')\n", - " connection_info['private_key'] = pkb\n", - " elif sf_password:\n", - " connection_info['password'] = sf_password\n", - " else:\n", - " raise Exception(\"\\nPlease set up the secret for Snowflake in your project!\\n\")\n", - "\n", - " # setup dask client from the MLRun dask cluster function\n", - " if dask_client:\n", - " client = mlrun.import_function(dask_client).client\n", - " context.logger.info(f'Existing dask client === >>> {client}\\n')\n", - " else:\n", - " client = Client()\n", - " context.logger.info(f'\\nNewly created dask client === >>> {client}\\n')\n", - "\n", - " conn = snow.connect(**connection_info)\n", - " cur = conn.cursor()\n", - " cur.execute(query)\n", - " batches = cur.get_result_batches()\n", - " context.logger.info(f'batches len === {len(batches)}\\n')\n", - "\n", - " dfs = []\n", - " for batch in batches:\n", - " if batch.rowcount > 0:\n", - " df = load(batch)\n", - " dfs.append(df)\n", - " ddf = from_delayed(dfs)\n", - "\n", - " # materialize the query results set for some sample compute\n", - "\n", - " ddf_describe = ddf.describe().compute()\n", - "\n", - " context.logger.info(f'query === >>> {query}\\n')\n", - " context.logger.info(f'ddf === >>> {ddf}\\n')\n", - " context.log_result('number of rows', len(ddf.index))\n", - " context.log_dataset(\"ddf_describe\", df=ddf_describe)\n", - "\n", - " if 
publish_name:\n", - " context.log_result('data_set_name', publish_name)\n", - " if not client.list_datasets():\n", - " ddf.persist(name = publish_name)\n", - " client.publish_dataset(publish_name=ddf)\n", - "\n", - " if parquet_out_dir:\n", - " dd.to_parquet(df=ddf, path=parquet_out_dir)\n", - " context.log_result('parquet directory', parquet_out_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Convert the code to MLRun function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use code_to_function to convert the code to MLRun
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "fn = mlrun.code_to_function(name=\"snowflake-dask\", \n", - " kind='job', \n", - " filename='snowflake_dask.py',\n", - " image='mlrun/mlrun',\n", - " requirements='requirements.txt',\n", - " handler=\"load_results\", \n", - " description=\"Snowflake Dask - Ingest snowflake data in parallel with Dask cluster\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"xingsheng\"}\n", - " )\n", - "fn.apply(mlrun.platforms.auto_mount())\n", - "fn.deploy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### export function to local `function.yaml` file for testing\n", - "in the real usage, we will import a function from hub" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.export('function.yaml')\n", - "# print(fn.to_yaml())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### import a function from local `function.yaml' for testing (Need to change it to import from hub before PR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn = mlrun.import_function(\"./function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fn = mlrun.import_function(\"hub://snowflake_dask\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.apply(mlrun.platforms.auto_mount()) # this is a very important line" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### create a dask cluster and specify the configuration for the dask process (e.g. 
replicas, memory etc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# function URI is db:///\n", - "dask_uri = f'db://{project_name}/{dask_cluster_name}'\n", - "dask_uri" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dsf = mlrun.new_function(name=dask_cluster_name, \n", - " kind='dask', \n", - " image='mlrun/mlrun',\n", - " requirements=[\"bokeh\", \"snowflake-connector-python[pandas]\"]\n", - " )\n", - "dsf.apply(mlrun.mount_v3io())\n", - "dsf.spec.remote = True\n", - "dsf.spec.min_replicas = 1\n", - "dsf.spec.max_replicas = 10\n", - "dsf.spec.service_type = \"NodePort\"\n", - "dsf.with_requests(mem='4G', cpu='2')\n", - "# dsf.spec.node_port=30088\n", - "# dsf.spec.scheduler_timeout = \"5 days\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dsf.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = dsf.client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When running the function you would see a remote dashboard link as part of the result. 
click on this link takes you to the dask monitoring dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "p = 'my-local-test'\n", - "parquet_path = f\"/v3io/bigdata/pq_from_sf_dask/{p}\"\n", - "\n", - "fn.run(handler = 'load_results',\n", - " params={\"dask_client\": dask_uri, \n", - " \"connection_info\": connection_info, \n", - " \"query\": \"SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER\",\n", - " \"parquet_out_dir\": parquet_path,\n", - " \"publish_name\": \"customer\",\n", - " }\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Track the progress in the UI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Users can view the progress and detailed information in the mlrun UI by clicking on the uid above.
\n", - "Also, to track the dask progress in the dask UI click on the \"dashboard link\" above the \"client\" section" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/snowflake_dask/latest/src/snowflake_dask.py b/functions/development/snowflake_dask/latest/src/snowflake_dask.py deleted file mode 100644 index 8846e821..00000000 --- a/functions/development/snowflake_dask/latest/src/snowflake_dask.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -"""Snowflake Dask - Ingest Snaowflake data with Dask""" - -import warnings -import mlrun -from mlrun.execution import MLClientCtx -import snowflake.connector as snow -from dask.distributed import Client -from dask.dataframe import from_delayed -from dask import delayed -from dask import dataframe as dd -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import serialization - -warnings.filterwarnings("ignore") - -@delayed -def load(batch): - - """A delayed load one batch.""" - - try: - print("BATCHING") - df_ = batch.to_pandas() - return df_ - except Exception as e: - print(f"Failed on {batch} for {e}") - raise - -def load_results(context: MLClientCtx, - dask_client: str, - connection_info: str, - query: str, - parquet_out_dir = None, - publish_name = None - ) -> None: - - """Snowflake Dask - Ingest Snowflake data with Dask - - :param context: the function context - :param dask_client: dask cluster function name - :param connection_info: Snowflake database connection info (this will be in a secret later) - :param query: query to for Snowflake - :param parquet_out_dir: directory path for the output parquet files - (default None, not write out) - :param publish_name: name of the dask dataframe to publish to the dask cluster - (default None, not publish) - - """ - context = mlrun.get_or_create_ctx('snawflake-dask-cluster') - sf_password = context.get_secret('sfPassword') - pk_path = context.get_secret('pkPath') - pk_password = context.get_secret('pkPassword') - - if pk_path and pk_password: - with open(pk_path, "rb") as key: - p_key= serialization.load_pem_private_key( - key.read(), - password=str(pk_password).encode(), - backend=default_backend() - ) - pkb = p_key.private_bytes( - encoding=serialization.Encoding.DER, - format=serialization.PrivateFormat.PKCS8 - ,encryption_algorithm=serialization.NoEncryption() - ) - connection_info.pop('password', 'No password found') - connection_info['private_key'] = pkb - elif 
sf_password: - connection_info['password'] = sf_password - else: - raise Exception("\nPlease set up the secret for Snowflake in your project!\n") - - # setup dask client from the MLRun dask cluster function - if dask_client: - client = mlrun.import_function(dask_client).client - context.logger.info(f'Existing dask client === >>> {client}\n') - else: - client = Client() - context.logger.info(f'\nNewly created dask client === >>> {client}\n') - - conn = snow.connect(**connection_info) - cur = conn.cursor() - cur.execute(query) - batches = cur.get_result_batches() - context.logger.info(f'batches len === {len(batches)}\n') - - dfs = [] - for batch in batches: - if batch.rowcount > 0: - df = load(batch) - dfs.append(df) - ddf = from_delayed(dfs) - - # materialize the query results set for some sample compute - - ddf_describe = ddf.describe().compute() - - context.logger.info(f'query === >>> {query}\n') - context.logger.info(f'ddf === >>> {ddf}\n') - context.log_result('number of rows', len(ddf.index)) - context.log_dataset("ddf_describe", df=ddf_describe) - - if publish_name: - context.log_result('data_set_name', publish_name) - if not client.list_datasets(): - ddf.persist(name = publish_name) - client.publish_dataset(publish_name=ddf) - - if parquet_out_dir: - dd.to_parquet(df=ddf, path=parquet_out_dir) - context.log_result('parquet directory', parquet_out_dir) diff --git a/functions/development/snowflake_dask/latest/src/test_snowflake_dask.py b/functions/development/snowflake_dask/latest/src/test_snowflake_dask.py deleted file mode 100644 index fc2d4c93..00000000 --- a/functions/development/snowflake_dask/latest/src/test_snowflake_dask.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Snowflake Dask unit test""" -from mlrun import import_function - -def test_snowflake_dask(): - """An unit test""" - fn_to_test = import_function("function.yaml") - - # a fake assert to pass the unit test - if fn_to_test.to_yaml().__contains__('job'): - assert True diff --git a/functions/development/snowflake_dask/latest/static/documentation.html b/functions/development/snowflake_dask/latest/static/documentation.html deleted file mode 100644 index 68ca0ecd..00000000 --- a/functions/development/snowflake_dask/latest/static/documentation.html +++ /dev/null @@ -1,247 +0,0 @@ - - - - - - - -snowflake_dask package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

snowflake_dask package

- -
- -
-
-
-
-
-

snowflake_dask package#

-
-

Submodules#

-
-
-

snowflake_dask.snowflake_dask module#

-

Snowflake Dask - Ingest Snaowflake data with Dask

-
-
-snowflake_dask.snowflake_dask.load(batch)#
-

A delayed load one batch.

-
-
-
-snowflake_dask.snowflake_dask.load_results(context: mlrun.execution.MLClientCtx, dask_client: str, connection_info: str, query: str, parquet_out_dir=None, publish_name=None)None[source]#
-

Snowflake Dask - Ingest Snowflake data with Dask

-
-
Parameters
-
    -
  • context – the function context

  • -
  • dask_client – dask cluster function name

  • -
  • connection_info – Snowflake database connection info (this will be in a secret later)

  • -
  • query – query to for Snowflake

  • -
  • parquet_out_dir – directory path for the output parquet files -(default None, not write out)

  • -
  • publish_name – name of the dask dataframe to publish to the dask cluster -(default None, not publish)

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/static/example.html b/functions/development/snowflake_dask/latest/static/example.html deleted file mode 100644 index 35fe27a2..00000000 --- a/functions/development/snowflake_dask/latest/static/example.html +++ /dev/null @@ -1,618 +0,0 @@ - - - - - - - -This notebook is to create a function to ingest data from snowflake with a Dask cluster - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

This notebook is to create a function to ingest data from snowflake with a Dask cluster#

-

The dask frameworks enables users to parallelize their python code and run it as a distributed process on Iguazio cluster and dramatically accelerate their performance.
-In this notebook we’ll create an mlrun function running as a dask client to ingest data from snowflake.
-It also demonstrates how to run parallelize query against snowflake using Dask Delayed option to query a large data set from snowflake.
-The function will be published on the function marketplace.
-For more information on dask over kubernetes: https://kubernetes.dask.org/en/latest/

-
-

Set up the enviroment#

-
-
-
import mlrun
-import os
-import warnings
-import yaml
-
-project_name = "snowflake-dask"
-dask_cluster_name="snowflake-dask-cluster"
-artifact_path = mlrun.set_environment(project=project_name,
-                                      artifact_path = os.path.join(os.path.abspath('/v3io/projects/'), project_name))
-
-warnings.filterwarnings("ignore")
-
-print(f'artifact_path = {artifact_path}')
-
-
-
-
-
-
-

Load snowflake configuration from config file.#

-

This is for demo purpose, in the real production code, you would need to put the snowflake connection info into secrets use the secrets in the running pod to connect to snowflake

-
-
-
# Load connection info
-with open(".config.yaml") as f:
-    connection_info = yaml.safe_load(f)
-
-# verify the config
-print(connection_info['account'])
-
-
-
-
-
-
-

Create a python function#

-

This function querys data from snowflake using snowflake python connector for parallel processing of the query results.
-With snoeflake python connector, when you execute a query, the cursor will return the result batches.
-Using Dask Delayed it will return and process results set in parallel.

-
-

write the function to a py file#

-
-
-
%%writefile snowflake_dask.py
-"""Snowflake Dask - Ingest Snowflake data with Dask"""
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives import serialization
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-def load_results(context: MLClientCtx,
-                 dask_client: str,
-                 connection_info: str,
-                 query: str,
-                 parquet_out_dir = None,
-                 publish_name = None
-                ) -> None:
-
-    """Snowflake Dask - Ingest Snowflake data with Dask
-
-    :param context:           the function context
-    :param dask_client:       dask cluster function name
-    :param connection_info:   Snowflake database connection info (this will be in a secret later)
-    :param query:             query to for Snowflake
-    :param parquet_out_dir:   directory path for the output parquet files
-                              (default None, not write out)
-    :param publish_name:      name of the dask dataframe to publish to the dask cluster
-                              (default None, not publish)
-
-    """
-    context = mlrun.get_or_create_ctx('snawflake-dask-cluster')
-    sf_password = context.get_secret('sfPassword')
-    pk_path =  context.get_secret('pkPath')
-    pk_password =  context.get_secret('pkPassword')
-
-    if pk_path and pk_password:
-        with open(pk_path, "rb") as key:
-            p_key= serialization.load_pem_private_key(
-                key.read(),
-                password=str(pk_password).encode(),
-                backend=default_backend()
-            )
-        pkb = p_key.private_bytes(
-            encoding=serialization.Encoding.DER,
-            format=serialization.PrivateFormat.PKCS8
-            ,encryption_algorithm=serialization.NoEncryption()
-        )
-        connection_info.pop('password', 'No password found')
-        connection_info['private_key'] = pkb
-    elif sf_password:
-        connection_info['password'] = sf_password
-    else:
-        raise Exception("\nPlease set up the secret for Snowflake in your project!\n")
-
-    # setup dask client from the MLRun dask cluster function
-    if dask_client:
-        client = mlrun.import_function(dask_client).client
-        context.logger.info(f'Existing dask client === >>> {client}\n')
-    else:
-        client = Client()
-        context.logger.info(f'\nNewly created dask client === >>> {client}\n')
-
-    conn = snow.connect(**connection_info)
-    cur = conn.cursor()
-    cur.execute(query)
-    batches = cur.get_result_batches()
-    context.logger.info(f'batches len === {len(batches)}\n')
-
-    dfs = []
-    for batch in batches:
-        if batch.rowcount > 0:
-            df = load(batch)
-            dfs.append(df)
-    ddf = from_delayed(dfs)
-
-    # materialize the query results set for some sample compute
-
-    ddf_describe = ddf.describe().compute()
-
-    context.logger.info(f'query  === >>> {query}\n')
-    context.logger.info(f'ddf  === >>> {ddf}\n')
-    context.log_result('number of rows', len(ddf.index))
-    context.log_dataset("ddf_describe", df=ddf_describe)
-
-    if publish_name:
-        context.log_result('data_set_name', publish_name)
-        if not client.list_datasets():
-            ddf.persist(name = publish_name)
-            client.publish_dataset(publish_name=ddf)
-
-    if parquet_out_dir:
-        dd.to_parquet(df=ddf, path=parquet_out_dir)
-        context.log_result('parquet directory', parquet_out_dir)
-
-
-
-
-
-
-
-

Convert the code to MLRun function#

-

Use code_to_function to convert the code to MLRun

-
-
-
fn = mlrun.code_to_function(name="snowflake-dask",  
-                            kind='job', 
-                            filename='snowflake_dask.py',
-                            image='mlrun/mlrun',
-                            requirements='requirements.txt',
-                            handler="load_results", 
-                            description="Snowflake Dask - Ingest snowflake data in parallel with Dask cluster",
-                            categories=["data-prep"],
-                            labels={"author": "xingsheng"}
-                           )
-fn.apply(mlrun.platforms.auto_mount())
-fn.deploy()
-
-
-
-
-
-

export function to local function.yaml file for testing#

-

in the real usage, we will import a function from hub

-
-
-
fn.export('function.yaml')
-# print(fn.to_yaml())
-
-
-
-
-
-
-

import a function from local `function.yaml’ for testing (Need to change it to import from hub before PR)#

-
-
-
fn = mlrun.import_function("./function.yaml")
-
-
-
-
-
-
-
# fn = mlrun.import_function("hub://snowflake_dask")
-
-
-
-
-
-
-
fn.apply(mlrun.platforms.auto_mount()) # this is a very important line
-
-
-
-
-
-
-

create a dask cluster and specify the configuration for the dask process (e.g. replicas, memory etc)#

-
-
-
# function URI is db://<project>/<name>
-dask_uri = f'db://{project_name}/{dask_cluster_name}'
-dask_uri
-
-
-
-
-
-
-
dsf = mlrun.new_function(name=dask_cluster_name, 
-                         kind='dask', 
-                         image='mlrun/mlrun',
-                         requirements=["bokeh", "snowflake-connector-python[pandas]"]
-                        )
-dsf.apply(mlrun.mount_v3io())
-dsf.spec.remote = True
-dsf.spec.min_replicas = 1
-dsf.spec.max_replicas = 10
-dsf.spec.service_type = "NodePort"
-dsf.with_requests(mem='4G', cpu='2')
-# dsf.spec.node_port=30088
-# dsf.spec.scheduler_timeout = "5 days"
-
-
-
-
-
-
-
dsf.deploy()
-
-
-
-
-
-
-
client = dsf.client
-
-
-
-
-
-
-
-

Run the function#

-

When running the function you would see a remote dashboard link as part of the result. click on this link takes you to the dask monitoring dashboard

-
-
-
p = 'my-local-test'
-parquet_path = f"/v3io/bigdata/pq_from_sf_dask/{p}"
-
-fn.run(handler = 'load_results',
-       params={"dask_client": dask_uri, 
-               "connection_info": connection_info, 
-               "query": "SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER",
-               "parquet_out_dir": parquet_path,
-               "publish_name": "customer",
-              }
-      )
-
-
-
-
-
-
-
client.close()
-
-
-
-
-
-
-

Track the progress in the UI#

-

Users can view the progress and detailed information in the mlrun UI by clicking on the uid above.
-Also, to track the dask progress in the dask UI click on the “dashboard link” above the “client” section

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/static/function.html b/functions/development/snowflake_dask/latest/static/function.html deleted file mode 100644 index eda48073..00000000 --- a/functions/development/snowflake_dask/latest/static/function.html +++ /dev/null @@ -1,103 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: snowflake-dask
-  tag: ''
-  hash: a002c7743b4a7471c7befe00f5497de050ebe902
-  project: snowflake-dask
-  labels:
-    author: xingsheng
-  categories:
-  - data-prep
-  credentials:
-    access_key: ec09bfc8-1cb4-466d-9049-852081973ce3
-spec:
-  command: ''
-  args: []
-  image: .mlrun/func-snowflake-dask-snowflake-dask:latest
-  build:
-    functionSourceCode: IiIiU25vd2ZsYWtlIERhc2sgLSBJbmdlc3QgU25hb3dmbGFrZSBkYXRhIHdpdGggRGFzayIiIgppbXBvcnQgd2FybmluZ3MKaW1wb3J0IG1scnVuCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eAppbXBvcnQgc25vd2ZsYWtlLmNvbm5lY3RvciBhcyBzbm93CmZyb20gZGFzay5kaXN0cmlidXRlZCBpbXBvcnQgQ2xpZW50CmZyb20gZGFzay5kYXRhZnJhbWUgaW1wb3J0IGZyb21fZGVsYXllZApmcm9tIGRhc2sgaW1wb3J0IGRlbGF5ZWQKZnJvbSBkYXNrIGltcG9ydCBkYXRhZnJhbWUgYXMgZGQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LmJhY2tlbmRzIGltcG9ydCBkZWZhdWx0X2JhY2tlbmQKZnJvbSBjcnlwdG9ncmFwaHkuaGF6bWF0LnByaW1pdGl2ZXMgaW1wb3J0IHNlcmlhbGl6YXRpb24KCndhcm5pbmdzLmZpbHRlcndhcm5pbmdzKCJpZ25vcmUiKQoKQGRlbGF5ZWQKZGVmIGxvYWQoYmF0Y2gpOgoKICAgICIiIkEgZGVsYXllZCBsb2FkIG9uZSBiYXRjaC4iIiIKCiAgICB0cnk6CiAgICAgICAgcHJpbnQoIkJBVENISU5HIikKICAgICAgICBkZl8gPSBiYXRjaC50b19wYW5kYXMoKQogICAgICAgIHJldHVybiBkZl8KICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICBwcmludChmIkZhaWxlZCBvbiB7YmF0Y2h9IGZvciB7ZX0iKQogICAgICAgIHJhaXNlCgpkZWYgbG9hZF9yZXN1bHRzKGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgICAgICAgICAgICAgIGRhc2tfY2xpZW50OiBzdHIsCiAgICAgICAgICAgICAgICAgY29ubmVjdGlvbl9pbmZvOiBzdHIsCiAgICAgICAgICAgICAgICAgcXVlcnk6IHN0ciwKICAgICAgICAgICAgICAgICBwYXJxdWV0X291dF9kaXIgPSBOb25lLAogICAgICAgICAgICAgICAgIHB1Ymxpc2hfbmFtZSA9IE5vbmUKICAgICAgICAgICAgICAgICkgLT4gTm9uZToKCiAgICAiIiJTbm93Zmxha2UgRGFzayAtIEluZ2VzdCBTbmFvd2ZsYWtlIGRhdGEgd2l0aCBEYXNrCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGRhc2tfY2xpZW50OiAgICAgICBkYXNrIGNsdXN0ZXIgZnVuY3Rpb24gbmFtZQogICAgOnBhcmFtIGNvbm5lY3Rpb25faW5mbzogICBTbm93Zmxha2UgZGF0YWJhc2UgY29ubmVjdGlvbiBpbmZvICh0aGlzIHdpbGwgYmUgaW4gYSBzZWNyZXQgbGF0ZXIpCiAgICA6cGFyYW0gcXVlcnk6ICAgICAgICAgICAgIHF1ZXJ5IHRvIGZvciBTbm93Zmxha2UKICAgIDpwYXJhbSBwYXJxdWV0X291dF9kaXI6ICAgZGlyZWN0b3J5IHBhdGggZm9yIHRoZSBvdXRwdXQgcGFycXVldCBmaWxlcwogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAoZGVmYXVsdCBOb25lLCBub3Qgd3JpdGUgb3V0KQogICAgOnBhcmFtIHB1Ymxpc2hfbmFtZTogICAgICBuYW1lIG9mIHRoZSBkYXNrIGRhdGFmcmFtZSB0byBwdWJsaXNoIHRvIHRoZSBkYXNrIGNsdXN0ZXIKICAgICAgICAgICAgICA
gICAgICAgICAgICAgICAgKGRlZmF1bHQgTm9uZSwgbm90IHB1Ymxpc2gpCgogICAgIiIiCiAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgoJ3NuYXdmbGFrZS1kYXNrLWNsdXN0ZXInKQogICAgc2ZfcGFzc3dvcmQgPSBjb250ZXh0LmdldF9zZWNyZXQoJ3NmUGFzc3dvcmQnKQogICAgcGtfcGF0aCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGF0aCcpCiAgICBwa19wYXNzd29yZCA9ICBjb250ZXh0LmdldF9zZWNyZXQoJ3BrUGFzc3dvcmQnKQoKICAgIGlmIHBrX3BhdGggYW5kIHBrX3Bhc3N3b3JkOgogICAgICAgIHdpdGggb3Blbihwa19wYXRoLCAicmIiKSBhcyBrZXk6CiAgICAgICAgICAgIHBfa2V5PSBzZXJpYWxpemF0aW9uLmxvYWRfcGVtX3ByaXZhdGVfa2V5KAogICAgICAgICAgICAgICAga2V5LnJlYWQoKSwKICAgICAgICAgICAgICAgIHBhc3N3b3JkPXN0cihwa19wYXNzd29yZCkuZW5jb2RlKCksCiAgICAgICAgICAgICAgICBiYWNrZW5kPWRlZmF1bHRfYmFja2VuZCgpCiAgICAgICAgICAgICkKICAgICAgICBwa2IgPSBwX2tleS5wcml2YXRlX2J5dGVzKAogICAgICAgICAgICBlbmNvZGluZz1zZXJpYWxpemF0aW9uLkVuY29kaW5nLkRFUiwKICAgICAgICAgICAgZm9ybWF0PXNlcmlhbGl6YXRpb24uUHJpdmF0ZUZvcm1hdC5QS0NTOAogICAgICAgICAgICAsZW5jcnlwdGlvbl9hbGdvcml0aG09c2VyaWFsaXphdGlvbi5Ob0VuY3J5cHRpb24oKQogICAgICAgICkKICAgICAgICBjb25uZWN0aW9uX2luZm8ucG9wKCdwYXNzd29yZCcsICdObyBwYXNzd29yZCBmb3VuZCcpCiAgICAgICAgY29ubmVjdGlvbl9pbmZvWydwcml2YXRlX2tleSddID0gcGtiCiAgICBlbGlmIHNmX3Bhc3N3b3JkOgogICAgICAgIGNvbm5lY3Rpb25faW5mb1sncGFzc3dvcmQnXSA9IHNmX3Bhc3N3b3JkCiAgICBlbHNlOgogICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiXG5QbGVhc2Ugc2V0IHVwIHRoZSBzZWNyZXQgZm9yIFNub3dmbGFrZSBpbiB5b3VyIHByb2plY3QhXG4iKQoKICAgICMgc2V0dXAgZGFzayBjbGllbnQgZnJvbSB0aGUgTUxSdW4gZGFzayBjbHVzdGVyIGZ1bmN0aW9uCiAgICBpZiBkYXNrX2NsaWVudDoKICAgICAgICBjbGllbnQgPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oZGFza19jbGllbnQpLmNsaWVudAogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidFeGlzdGluZyBkYXNrIGNsaWVudCA9PT0gPj4+IHtjbGllbnR9XG4nKQogICAgZWxzZToKICAgICAgICBjbGllbnQgPSBDbGllbnQoKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidcbk5ld2x5IGNyZWF0ZWQgZGFzayBjbGllbnQgPT09ID4+PiB7Y2xpZW50fVxuJykKCiAgICBjb25uID0gc25vdy5jb25uZWN0KCoqY29ubmVjdGlvbl9pbmZvKQogICAgY3VyID0gY29ubi5jdXJzb3IoKQogICAgY3VyLmV4ZWN1dGUocXVlcnkpCiAgICBiYXRjaGVzID0gY3VyLmdldF9yZXN1bHRfYmF0Y2hlcygpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnYmF
0Y2hlcyBsZW4gPT09IHtsZW4oYmF0Y2hlcyl9XG4nKQoKICAgIGRmcyA9IFtdCiAgICBmb3IgYmF0Y2ggaW4gYmF0Y2hlczoKICAgICAgICBpZiBiYXRjaC5yb3djb3VudCA+IDA6CiAgICAgICAgICAgIGRmID0gbG9hZChiYXRjaCkKICAgICAgICAgICAgZGZzLmFwcGVuZChkZikKICAgIGRkZiA9IGZyb21fZGVsYXllZChkZnMpCgogICAgIyBtYXRlcmlhbGl6ZSB0aGUgcXVlcnkgcmVzdWx0cyBzZXQgZm9yIHNvbWUgc2FtcGxlIGNvbXB1dGUKCiAgICBkZGZfZGVzY3JpYmUgPSBkZGYuZGVzY3JpYmUoKS5jb21wdXRlKCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncXVlcnkgID09PSA+Pj4ge3F1ZXJ5fVxuJykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidkZGYgID09PSA+Pj4ge2RkZn1cbicpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ251bWJlciBvZiByb3dzJywgbGVuKGRkZi5pbmRleCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJkZGZfZGVzY3JpYmUiLCBkZj1kZGZfZGVzY3JpYmUpCgogICAgaWYgcHVibGlzaF9uYW1lOgogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnZGF0YV9zZXRfbmFtZScsIHB1Ymxpc2hfbmFtZSkKICAgICAgICBpZiBub3QgY2xpZW50Lmxpc3RfZGF0YXNldHMoKToKICAgICAgICAgICAgZGRmLnBlcnNpc3QobmFtZSA9IHB1Ymxpc2hfbmFtZSkKICAgICAgICAgICAgY2xpZW50LnB1Ymxpc2hfZGF0YXNldChwdWJsaXNoX25hbWU9ZGRmKQoKICAgIGlmIHBhcnF1ZXRfb3V0X2RpcjoKICAgICAgICBkZC50b19wYXJxdWV0KGRmPWRkZiwgcGF0aD1wYXJxdWV0X291dF9kaXIpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdwYXJxdWV0IGRpcmVjdG9yeScsIHBhcnF1ZXRfb3V0X2RpcikK
-    base_image: mlrun/mlrun
-    commands:
-    - python -m pip install bokeh snowflake-connector-python[pandas] mlrun~=0.9.1
-    code_origin: https://github.com/xsqian/functions.git#6b31040e2ad762602f335b0589823a1c61a09975:snowflake_dask.py
-    origin_filename: snowflake_dask.py
-  entry_points:
-    load:
-      name: load
-      doc: A delayed load one batch.
-      parameters:
-      - name: batch
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 15
-    load_results:
-      name: load_results
-      doc: Snowflake Dask - Ingest Snaowflake data with Dask
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: dask_client
-        type: str
-        doc: dask cluster function name
-        default: ''
-      - name: connection_info
-        type: str
-        doc: Snowflake database connection info (this will be in a secret later)
-        default: ''
-      - name: query
-        type: str
-        doc: query to for Snowflake
-        default: ''
-      - name: parquet_out_dir
-        doc: directory path for the output parquet files (default None, not write
-          out)
-        default: null
-      - name: publish_name
-        doc: name of the dask dataframe to publish to the dask cluster (default None,
-          not publish)
-        default: null
-      outputs:
-      - default: ''
-      lineno: 28
-  description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster
-  default_handler: load_results
-  disable_auto_mount: false
-  env:
-  - name: V3IO_API
-    value: ''
-  - name: V3IO_USERNAME
-    value: ''
-  - name: V3IO_ACCESS_KEY
-    value: ''
-  - name: V3IO_FRAMESD
-    value: ''
-  priority_class_name: igz-workload-medium
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/static/item.html b/functions/development/snowflake_dask/latest/static/item.html deleted file mode 100644 index c7553d0c..00000000 --- a/functions/development/snowflake_dask/latest/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: Snowflake Dask - Ingest snowflake data in parallel with Dask cluster
-doc: ''
-example: snowflake-dask-mlrun.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: xingsheng
-  framework: dask
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.1
-name: snowflake_dask
-platformVersion: 3.5.0
-spec:
-  filename: snowflake_dask.py
-  handler: load_results
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/static/snowflake_dask.html b/functions/development/snowflake_dask/latest/static/snowflake_dask.html deleted file mode 100644 index 1acc2302..00000000 --- a/functions/development/snowflake_dask/latest/static/snowflake_dask.html +++ /dev/null @@ -1,265 +0,0 @@ - - - - - - - -snowflake_dask.snowflake_dask - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for snowflake_dask.snowflake_dask

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Snowflake Dask - Ingest Snaowflake data with Dask"""
-
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives import serialization
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-
[docs]def load_results(context: MLClientCtx, - dask_client: str, - connection_info: str, - query: str, - parquet_out_dir = None, - publish_name = None - ) -> None: - - """Snowflake Dask - Ingest Snowflake data with Dask - - :param context: the function context - :param dask_client: dask cluster function name - :param connection_info: Snowflake database connection info (this will be in a secret later) - :param query: query to for Snowflake - :param parquet_out_dir: directory path for the output parquet files - (default None, not write out) - :param publish_name: name of the dask dataframe to publish to the dask cluster - (default None, not publish) - - """ - context = mlrun.get_or_create_ctx('snawflake-dask-cluster') - sf_password = context.get_secret('sfPassword') - pk_path = context.get_secret('pkPath') - pk_password = context.get_secret('pkPassword') - - if pk_path and pk_password: - with open(pk_path, "rb") as key: - p_key= serialization.load_pem_private_key( - key.read(), - password=str(pk_password).encode(), - backend=default_backend() - ) - pkb = p_key.private_bytes( - encoding=serialization.Encoding.DER, - format=serialization.PrivateFormat.PKCS8 - ,encryption_algorithm=serialization.NoEncryption() - ) - connection_info.pop('password', 'No password found') - connection_info['private_key'] = pkb - elif sf_password: - connection_info['password'] = sf_password - else: - raise Exception("\nPlease set up the secret for Snowflake in your project!\n") - - # setup dask client from the MLRun dask cluster function - if dask_client: - client = mlrun.import_function(dask_client).client - context.logger.info(f'Existing dask client === >>> {client}\n') - else: - client = Client() - context.logger.info(f'\nNewly created dask client === >>> {client}\n') - - conn = snow.connect(**connection_info) - cur = conn.cursor() - cur.execute(query) - batches = cur.get_result_batches() - context.logger.info(f'batches len === {len(batches)}\n') - - dfs = [] - for batch in batches: - if 
batch.rowcount > 0: - df = load(batch) - dfs.append(df) - ddf = from_delayed(dfs) - - # materialize the query results set for some sample compute - - ddf_describe = ddf.describe().compute() - - context.logger.info(f'query === >>> {query}\n') - context.logger.info(f'ddf === >>> {ddf}\n') - context.log_result('number of rows', len(ddf.index)) - context.log_dataset("ddf_describe", df=ddf_describe) - - if publish_name: - context.log_result('data_set_name', publish_name) - if not client.list_datasets(): - ddf.persist(name = publish_name) - client.publish_dataset(publish_name=ddf) - - if parquet_out_dir: - dd.to_parquet(df=ddf, path=parquet_out_dir) - context.log_result('parquet directory', parquet_out_dir)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/snowflake_dask/latest/static/source.html b/functions/development/snowflake_dask/latest/static/source.html deleted file mode 100644 index 99a7e972..00000000 --- a/functions/development/snowflake_dask/latest/static/source.html +++ /dev/null @@ -1,147 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Snowflake Dask - Ingest Snaowflake data with Dask"""
-
-import warnings
-import mlrun
-from mlrun.execution import MLClientCtx
-import snowflake.connector as snow
-from dask.distributed import Client
-from dask.dataframe import from_delayed
-from dask import delayed
-from dask import dataframe as dd
-from cryptography.hazmat.backends import default_backend
-from cryptography.hazmat.primitives import serialization
-
-warnings.filterwarnings("ignore")
-
-@delayed
-def load(batch):
-
-    """A delayed load one batch."""
-
-    try:
-        print("BATCHING")
-        df_ = batch.to_pandas()
-        return df_
-    except Exception as e:
-        print(f"Failed on {batch} for {e}")
-        raise
-
-def load_results(context: MLClientCtx,
-                 dask_client: str,
-                 connection_info: str,
-                 query: str,
-                 parquet_out_dir = None,
-                 publish_name = None
-                ) -> None:
-
-    """Snowflake Dask - Ingest Snowflake data with Dask
-
-    :param context:           the function context
-    :param dask_client:       dask cluster function name
-    :param connection_info:   Snowflake database connection info (this will be in a secret later)
-    :param query:             query to for Snowflake
-    :param parquet_out_dir:   directory path for the output parquet files
-                              (default None, not write out)
-    :param publish_name:      name of the dask dataframe to publish to the dask cluster
-                              (default None, not publish)
-
-    """
-    context = mlrun.get_or_create_ctx('snawflake-dask-cluster')
-    sf_password = context.get_secret('sfPassword')
-    pk_path =  context.get_secret('pkPath')
-    pk_password =  context.get_secret('pkPassword')
-
-    if pk_path and pk_password:
-        with open(pk_path, "rb") as key:
-            p_key= serialization.load_pem_private_key(
-                key.read(),
-                password=str(pk_password).encode(),
-                backend=default_backend()
-            )
-        pkb = p_key.private_bytes(
-            encoding=serialization.Encoding.DER,
-            format=serialization.PrivateFormat.PKCS8
-            ,encryption_algorithm=serialization.NoEncryption()
-        )
-        connection_info.pop('password', 'No password found')
-        connection_info['private_key'] = pkb
-    elif sf_password:
-        connection_info['password'] = sf_password
-    else:
-        raise Exception("\nPlease set up the secret for Snowflake in your project!\n")
-
-    # setup dask client from the MLRun dask cluster function
-    if dask_client:
-        client = mlrun.import_function(dask_client).client
-        context.logger.info(f'Existing dask client === >>> {client}\n')
-    else:
-        client = Client()
-        context.logger.info(f'\nNewly created dask client === >>> {client}\n')
-
-    conn = snow.connect(**connection_info)
-    cur = conn.cursor()
-    cur.execute(query)
-    batches = cur.get_result_batches()
-    context.logger.info(f'batches len === {len(batches)}\n')
-
-    dfs = []
-    for batch in batches:
-        if batch.rowcount > 0:
-            df = load(batch)
-            dfs.append(df)
-    ddf = from_delayed(dfs)
-
-    # materialize the query results set for some sample compute
-
-    ddf_describe = ddf.describe().compute()
-
-    context.logger.info(f'query  === >>> {query}\n')
-    context.logger.info(f'ddf  === >>> {ddf}\n')
-    context.log_result('number of rows', len(ddf.index))
-    context.log_dataset("ddf_describe", df=ddf_describe)
-
-    if publish_name:
-        context.log_result('data_set_name', publish_name)
-        if not client.list_datasets():
-            ddf.persist(name = publish_name)
-            client.publish_dataset(publish_name=ddf)
-
-    if parquet_out_dir:
-        dd.to_parquet(df=ddf, path=parquet_out_dir)
-        context.log_result('parquet directory', parquet_out_dir)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/src/function.yaml b/functions/development/sql_to_file/0.0.1/src/function.yaml deleted file mode 100644 index 06301a9a..00000000 --- a/functions/development/sql_to_file/0.0.1/src/function.yaml +++ /dev/null @@ -1,47 +0,0 @@ -kind: job -metadata: - name: sql-to-file - tag: '' - hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863 - project: default - labels: - author: adih - categories: - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: sql_to_file - entry_points: - sql_to_file: - name: sql_to_file - doc: SQL Ingest - Ingest data using SQL query - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: sql_query - type: str - doc: the sql query used to retrieve the data - default: '' - - name: database_url - type: str - doc: database connection URL - default: '' - - name: file_ext - type: str - doc: ("parquet") format for result file - default: parquet - outputs: - - default: '' - lineno: 9 - description: SQL To File - Ingest data using SQL query - build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py - affinity: null -verbose: false diff --git a/functions/development/sql_to_file/0.0.1/src/item.yaml b/functions/development/sql_to_file/0.0.1/src/item.yaml deleted file mode 100644 index 099cbef1..00000000 --- a/functions/development/sql_to_file/0.0.1/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: SQL To File - Ingest data using SQL query -doc: '' -example: sql_to_file.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: adih -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: sql-to-file -platformVersion: '' -spec: - filename: sql_to_file.py - handler: sql_to_file - image: mlrun/mlrun - kind: job - requirements: [] -url: '' -version: 
0.0.1 diff --git a/functions/development/sql_to_file/0.0.1/src/requirements.txt b/functions/development/sql_to_file/0.0.1/src/requirements.txt deleted file mode 100644 index 21ef3f07..00000000 --- a/functions/development/sql_to_file/0.0.1/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mlrun -PyHive -pymysql \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/src/sql_to_file.ipynb b/functions/development/sql_to_file/0.0.1/src/sql_to_file.ipynb deleted file mode 100644 index d4a084ad..00000000 --- a/functions/development/sql_to_file/0.0.1/src/sql_to_file.ipynb +++ /dev/null @@ -1,1567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SQL Ingest - Ingest data using SQL query " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'nuclio'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[1;31m# nuclio: ignore\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 2\u001B[1;33m \u001B[1;32mimport\u001B[0m \u001B[0mnuclio\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 3\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'nuclio'" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind = 
\"job\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "pip install sqlalchemy==1.3.11\n", - "pip install PyMySQL==0.9.3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyhive\n", - "from sqlalchemy.engine import create_engine\n", - "from mlrun.execution import MLClientCtx\n", - "\n", - "\n", - "def sql_to_file(\n", - " context: MLClientCtx,\n", - " sql_query: str,\n", - " database_url: str,\n", - " file_ext: str = \"parquet\",\n", - ") -> None:\n", - " \"\"\"SQL Ingest - Ingest data using SQL query\n", - "\n", - " :param context: the function context\n", - " :param sql_query: the sql query used to retrieve the data\n", - " :param database_url: database connection URL\n", - " :param file_ext: (\"parquet\") format for result file\n", - "\n", - "\"\"\"\n", - "\n", - " engine = create_engine(database_url)\n", - " df = pd.read_sql(sql_query, engine)\n", - "\n", - " context.log_dataset('query result',\n", - " df=df,\n", - " format=file_ext,\n", - " artifact_path=context.artifact_subpath('data'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'HOME'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 
2\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34m'http://mlrun-api:8080'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34mf'{os.environ[\"HOME\"]}/artifacts'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;32mC:\\Program Files\\Python37\\lib\\os.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 679\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 680\u001B[0m \u001B[1;31m# raise KeyError with the original key value\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 681\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;32mfrom\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 682\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdecodevalue\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 683\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'HOME'" - ] - } - ], - "source": [ - "from mlrun import 
mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save function" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def mount_secret(\n", - " secret_name, volume_mount_path, volume_name='secret', items=None\n", - "):\n", - " def _mount_secret(task):\n", - " from kubernetes import client as k8s_client\n", - " vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)\n", - " return task.add_volume(\n", - " k8s_client.V1Volume(name=volume_name, secret=vol)\n", - " ).add_volume_mount(\n", - " k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)\n", - " )\n", - " return _mount_secret" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, NewTask\n", - "import os\n", - "\n", - "fn = code_to_function(name=\"sql_to_file\",\n", - " handler=\"sql_to_file\",\n", - " description=\"SQL To File - Ingest data using SQL query\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"adih\"})\n", - "\n", - "if \"V3IO_ACCESS_KEY\" in list(os.environ):\n", - " fn.apply(mount_secret(secret_name='presto-tls',\n", - " volume_mount_path= '/var/run/iguazio/secrets/'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build the image" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest\n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name 
mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. \n", - "\u001B[36mINFO\u001B[0m[0027] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0039] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "\u001B[36mINFO\u001B[0m[0046] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] \n", - "Collecting git+https://github.com/v3io/PyHive.git@v0.6.999\n", - " Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw\n", - " Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw\n", - "Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: PyHive\n", - " Building wheel for PyHive (setup.py): started\n", - " Building wheel for PyHive (setup.py): finished with status 'done'\n", - " Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 
sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d\n", - "Successfully built PyHive\n", - "Installing collected packages: PyHive\n", - "Successfully installed PyHive-0.6.1.dev0\n", - "\u001B[36mINFO\u001B[0m[0048] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0048] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0053] RUN pip install sqlalchemy==1.3.11 \n", - "\u001B[36mINFO\u001B[0m[0053] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0053] args: [-c pip install sqlalchemy==1.3.11] \n", - "Collecting sqlalchemy==1.3.11\n", - " Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)\n", - "Building wheels for collected packages: sqlalchemy\n", - " Building wheel for sqlalchemy (setup.py): started\n", - " Building wheel for sqlalchemy (setup.py): finished with status 'done'\n", - " Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c\n", - " Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16\n", - "Successfully built sqlalchemy\n", - "Installing collected packages: sqlalchemy\n", - " Attempting uninstall: sqlalchemy\n", - " Found existing installation: SQLAlchemy 1.3.17\n", - " Uninstalling SQLAlchemy-1.3.17:\n", - " Successfully uninstalled SQLAlchemy-1.3.17\n", - "Successfully installed sqlalchemy-1.3.11\n", - "\u001B[36mINFO\u001B[0m[0057] Taking snapshot of full filesystem... 
\n", - "\u001B[36mINFO\u001B[0m[0057] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0063] RUN pip install PyMySQL==0.9.3 \n", - "\u001B[36mINFO\u001B[0m[0063] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0063] args: [-c pip install PyMySQL==0.9.3] \n", - "Collecting PyMySQL==0.9.3\n", - " Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)\n", - "Installing collected packages: PyMySQL\n", - "Successfully installed PyMySQL-0.9.3\n", - "\u001B[36mINFO\u001B[0m[0064] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0064] Resolving paths \n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": "" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading from a public MySQL DB" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'\n", - "mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "sql_task = NewTask(name='sql',\n", - " handler=sql_to_file,\n", - " params={'sql_query': mysql_query,\n", - " 'database_url': mysql_url})\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default\n", - "[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run it on a cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz\n", - "[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default\n", - "[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SQL query from Iguazio Key Value via Presto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to create a table and set the sql_table path accordingly.
\n", - "you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import os\n", - "sql_table = os.path.join('v3io.users.\"'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab\"')\n", - "sql_query_string = 'select * from '+sql_table+\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
" - ], - "text/plain": [ - "[('UBS I.ETF-DL G.SEL.DIV.AD', 'ETF', '08:27', 'IE00BMP3HG27', 8.418, '2018-03-26 00:00:00.000', 8.418, 1, 'UBUM', 'EUR', 2505450, 8.418, 403, 8.418),\n", - " ('GILEAD SCIENCES DL-,001', 'Common stock', '08:00', 'US3755581036', 59.7, '2018-03-26 00:00:00.000', 59.84, 3, 'GIS', 'EUR', 2506495, 59.84, 745, 59.7),\n", - " ('3M CO. DL-,01', 'Common stock', '08:00', 'US88579Y1010', 176.51, '2018-03-26 00:00:00.000', 176.51, 1, 'MMM', 'EUR', 2506577, 176.51, 39, 176.51),\n", - " ('DIEBOLD NIXDORF INH.O.N.', 'Common stock', '08:06', 'DE000A0CAYB2', 66.3, '2018-03-26 00:00:00.000', 66.3, 1, 'WIN', 'EUR', 2504286, 66.3, 60, 66.3),\n", - " ('XTR.II EUR.INF.LINK.BD 1C', 'ETF', '08:13', 'LU0290358224', 218.97, '2018-03-26 00:00:00.000', 218.97, 1, 'DBXK', 'EUR', 2505840, 218.97, 110, 218.97),\n", - " ('UBS-ETF-MSCI EMU S.C.EOAD', 'ETF', '08:33', 'LU0671493277', 100.2, '2018-03-26 00:00:00.000', 100.2, 1, 'UEFD', 'EUR', 2506045, 100.2, 180, 100.2),\n", - " ('ASMALLWORLD AG SF 1', 'Common stock', '08:23', 'CH0404880129', 12.7, '2018-03-26 00:00:00.000', 12.7, 1, '1Q7', 'EUR', 3089122, 12.7, 400, 12.7),\n", - " ('IS.DJ GLOB.TITAN.50 U.ETF', 'ETF', '08:42', 'DE0006289382', 31.25, '2018-03-26 00:00:00.000', 31.25, 1, 'EXI2', 'EUR', 2505029, 31.25, 50, 31.25),\n", - " ('ISHS IV-AGEING POPUL.ETF', 'ETF', '08:17', 'IE00BYZK4669', 4.926, '2018-03-26 00:00:00.000', 4.926, 1, '2B77', 'EUR', 2505552, 4.926, 25, 4.926),\n", - " ('PORSCHE AUTOM.HLDG VZO', 'Common stock', '08:00', 'DE000PAH0038', 64.68, '2018-03-26 00:00:00.000', 64.76, 8, 'PAH3', 'EUR', 2504816, 64.76, 698, 64.7)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select * from $sql_table limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sql_task = NewTask(name='sql', \n", - " handler=sql_to_file,\n", - " params={'sql_query': sql_query_string,\n", - " 
'database_url': os.getenv('DATABASE_URL')}\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default\n", - "[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f\n", - "[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default\n", - "[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/src/sql_to_file.py b/functions/development/sql_to_file/0.0.1/src/sql_to_file.py deleted file mode 100644 index 086cb066..00000000 --- a/functions/development/sql_to_file/0.0.1/src/sql_to_file.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pyhive -from sqlalchemy.engine import create_engine -from mlrun.execution import MLClientCtx - - -def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - 
engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - ) diff --git a/functions/development/sql_to_file/0.0.1/src/test_sql_to_file.py b/functions/development/sql_to_file/0.0.1/src/test_sql_to_file.py deleted file mode 100644 index cc345a27..00000000 --- a/functions/development/sql_to_file/0.0.1/src/test_sql_to_file.py +++ /dev/null @@ -1,18 +0,0 @@ -from mlrun import code_to_function - -mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam' -mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family' - - -def test_run_sql_to_file(): - fn = code_to_function(name='test_sql_to_file', - filename="sql_to_file.py", - handler="sql_to_file", - kind="job", - ) - fn.run(params={'sql_query': mysql_query, - 'database_url': mysql_url} - , local=True - - ) - diff --git a/functions/development/sql_to_file/0.0.1/static/documentation.html b/functions/development/sql_to_file/0.0.1/static/documentation.html deleted file mode 100644 index 6d477881..00000000 --- a/functions/development/sql_to_file/0.0.1/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -sql_to_file package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

sql_to_file package

-
-

Submodules

-
-
-

sql_to_file.sql_to_file module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/static/example.html b/functions/development/sql_to_file/0.0.1/static/example.html deleted file mode 100644 index 17a02ee8..00000000 --- a/functions/development/sql_to_file/0.0.1/static/example.html +++ /dev/null @@ -1,1472 +0,0 @@ - - - - - - - -SQL Ingest - Ingest data using SQL query - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

SQL Ingest - Ingest data using SQL query

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
---------------------------------------------------------------------------
-ModuleNotFoundError                       Traceback (most recent call last)
-<ipython-input-1-0a63a0760af8> in <module>
-      1 # nuclio: ignore
-----> 2 import nuclio
-      3 
-
-ModuleNotFoundError: No module named 'nuclio'
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-pip install sqlalchemy==1.3.11
-pip install PyMySQL==0.9.3
-
-
-
-
-
-
-
import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-
-"""
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset('query result',
-                        df=df,
-                        format=file_ext,
-                        artifact_path=context.artifact_subpath('data'))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
---------------------------------------------------------------------------
-KeyError                                  Traceback (most recent call last)
-<ipython-input-7-c3828dc245f1> in <module>
-      2 import os
-      3 mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-----> 4 mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-      5 
-      6 
-
-C:\Program Files\Python37\lib\os.py in __getitem__(self, key)
-    679         except KeyError:
-    680             # raise KeyError with the original key value
---> 681             raise KeyError(key) from None
-    682         return self.decodevalue(value)
-    683 
-
-KeyError: 'HOME'
-
-
-
-
-
-
-

Save function

-
-
-
def mount_secret(
-    secret_name, volume_mount_path, volume_name='secret', items=None
-):
-    def _mount_secret(task):
-        from kubernetes import client as k8s_client
-        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
-        return task.add_volume(
-            k8s_client.V1Volume(name=volume_name, secret=vol)
-        ).add_volume_mount(
-            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
-        )
-    return _mount_secret
-
-
-
-
-
-
-
from mlrun import code_to_function, NewTask
-import os
-
-fn = code_to_function(name="sql_to_file",
-                      handler="sql_to_file",
-                      description="SQL To File - Ingest data using SQL query",
-                      categories=["data-prep"],
-                      labels={"author": "adih"})
-
-if "V3IO_ACCESS_KEY" in list(os.environ):
-    fn.apply(mount_secret(secret_name='presto-tls',
-                        volume_mount_path= '/var/run/iguazio/secrets/'))
-
-
-
-
-
-
-

Build the image

-
-
-
fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
-INFO[0027] Taking snapshot of full filesystem...        
-INFO[0039] Resolving paths                              
-INFO[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-INFO[0046] cmd: /bin/sh                                 
-INFO[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] 
-Collecting git+https://github.com/v3io/PyHive.git@v0.6.999
-  Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw
-  Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw
-Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)
-Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)
-Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)
-Building wheels for collected packages: PyHive
-  Building wheel for PyHive (setup.py): started
-  Building wheel for PyHive (setup.py): finished with status 'done'
-  Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166
-  Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d
-Successfully built PyHive
-Installing collected packages: PyHive
-Successfully installed PyHive-0.6.1.dev0
-INFO[0048] Taking snapshot of full filesystem...        
-INFO[0048] Resolving paths                              
-INFO[0053] RUN pip install sqlalchemy==1.3.11           
-INFO[0053] cmd: /bin/sh                                 
-INFO[0053] args: [-c pip install sqlalchemy==1.3.11]    
-Collecting sqlalchemy==1.3.11
-  Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)
-Building wheels for collected packages: sqlalchemy
-  Building wheel for sqlalchemy (setup.py): started
-  Building wheel for sqlalchemy (setup.py): finished with status 'done'
-  Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c
-  Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16
-Successfully built sqlalchemy
-Installing collected packages: sqlalchemy
-  Attempting uninstall: sqlalchemy
-    Found existing installation: SQLAlchemy 1.3.17
-    Uninstalling SQLAlchemy-1.3.17:
-      Successfully uninstalled SQLAlchemy-1.3.17
-Successfully installed sqlalchemy-1.3.11
-INFO[0057] Taking snapshot of full filesystem...        
-INFO[0057] Resolving paths                              
-INFO[0063] RUN pip install PyMySQL==0.9.3               
-INFO[0063] cmd: /bin/sh                                 
-INFO[0063] args: [-c pip install PyMySQL==0.9.3]        
-Collecting PyMySQL==0.9.3
-  Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)
-Installing collected packages: PyMySQL
-Successfully installed PyMySQL-0.9.3
-INFO[0064] Taking snapshot of full filesystem...        
-INFO[0064] Resolving paths                              
-
-
-
True
-
-
-
-
-
-
-
fn.export('function.yaml')
-
-
-
-
-
[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>
-
-
-
-
-
-
-

Test

-
-

Reading from a public MySQL DB

-
-
-
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
-mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-sql_task = NewTask(name='sql',
-                   handler=sql_to_file,
-                   params={'sql_query': mysql_query,
-                           'database_url': mysql_url})
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
-[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed
-
-
-
-
-
-

Run it on a cluster

-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
-[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
-[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87fba74b00>
-
-
-
-
-
-
-
-

SQL query from Iguazio Key Value via Presto

-

You need to create a table and set the sql_table path accordingly.
-you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

-
-
-
# nuclio: ignore
-import os
-sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
-sql_query_string = 'select * from '+sql_table+""
-
-
-
-
-
-
-
%sql select * from $sql_table limit 10
-
-
-
-
-
Done.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
-
-
-
-
sql_task = NewTask(name='sql', 
-                   handler=sql_to_file,
-                   params={'sql_query': sql_query_string,
-                          'database_url': os.getenv('DATABASE_URL')}
-                          )
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
-[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed
-
-
-
-
-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
-[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
-[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87f8e26c18>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/static/function.html b/functions/development/sql_to_file/0.0.1/static/function.html deleted file mode 100644 index cff68646..00000000 --- a/functions/development/sql_to_file/0.0.1/static/function.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: sql-to-file
-  tag: ''
-  hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863
-  project: default
-  labels:
-    author: adih
-  categories:
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: sql_to_file
-  entry_points:
-    sql_to_file:
-      name: sql_to_file
-      doc: SQL Ingest - Ingest data using SQL query
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: sql_query
-        type: str
-        doc: the sql query used to retrieve the data
-        default: ''
-      - name: database_url
-        type: str
-        doc: database connection URL
-        default: ''
-      - name: file_ext
-        type: str
-        doc: ("parquet") format for result file
-        default: parquet
-      outputs:
-      - default: ''
-      lineno: 9
-  description: SQL To File - Ingest data using SQL query
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/static/item.html b/functions/development/sql_to_file/0.0.1/static/item.html deleted file mode 100644 index 0a470f1d..00000000 --- a/functions/development/sql_to_file/0.0.1/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: SQL To File - Ingest data using SQL query
-doc: ''
-example: sql_to_file.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: adih
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: sql-to-file
-platformVersion: ''
-spec:
-  filename: sql_to_file.py
-  handler: sql_to_file
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.0.1/static/source.html b/functions/development/sql_to_file/0.0.1/static/source.html deleted file mode 100644 index 7d22754f..00000000 --- a/functions/development/sql_to_file/0.0.1/static/source.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-    """
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset(
-        "query result",
-        df=df,
-        format=file_ext,
-        artifact_path=context.artifact_subpath("data"),
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/src/function.yaml b/functions/development/sql_to_file/0.8.0/src/function.yaml deleted file mode 100644 index 06301a9a..00000000 --- a/functions/development/sql_to_file/0.8.0/src/function.yaml +++ /dev/null @@ -1,47 +0,0 @@ -kind: job -metadata: - name: sql-to-file - tag: '' - hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863 - project: default - labels: - author: adih - categories: - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: sql_to_file - entry_points: - sql_to_file: - name: sql_to_file - doc: SQL Ingest - Ingest data using SQL query - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: sql_query - type: str - doc: the sql query used to retrieve the data - default: '' - - name: database_url - type: str - doc: database connection URL - default: '' - - name: file_ext - type: str - doc: ("parquet") format for result file - default: parquet - outputs: - - default: '' - lineno: 9 - description: SQL To File - Ingest data using SQL query - build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py - affinity: null -verbose: false diff --git a/functions/development/sql_to_file/0.8.0/src/item.yaml b/functions/development/sql_to_file/0.8.0/src/item.yaml deleted file mode 100644 index cbc6aa05..00000000 --- a/functions/development/sql_to_file/0.8.0/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: SQL To File - Ingest data using SQL query -doc: '' -example: sql_to_file.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: adih -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: sql-to-file -platformVersion: 3.2.0 -spec: - filename: sql_to_file.py - handler: sql_to_file - image: mlrun/mlrun - kind: job - requirements: [] -url: '' 
-version: 0.8.0 diff --git a/functions/development/sql_to_file/0.8.0/src/requirements.txt b/functions/development/sql_to_file/0.8.0/src/requirements.txt deleted file mode 100644 index 21ef3f07..00000000 --- a/functions/development/sql_to_file/0.8.0/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mlrun -PyHive -pymysql \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/src/sql_to_file.ipynb b/functions/development/sql_to_file/0.8.0/src/sql_to_file.ipynb deleted file mode 100644 index d4a084ad..00000000 --- a/functions/development/sql_to_file/0.8.0/src/sql_to_file.ipynb +++ /dev/null @@ -1,1567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SQL Ingest - Ingest data using SQL query " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'nuclio'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[1;31m# nuclio: ignore\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 2\u001B[1;33m \u001B[1;32mimport\u001B[0m \u001B[0mnuclio\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 3\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'nuclio'" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config 
kind = \"job\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "pip install sqlalchemy==1.3.11\n", - "pip install PyMySQL==0.9.3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyhive\n", - "from sqlalchemy.engine import create_engine\n", - "from mlrun.execution import MLClientCtx\n", - "\n", - "\n", - "def sql_to_file(\n", - " context: MLClientCtx,\n", - " sql_query: str,\n", - " database_url: str,\n", - " file_ext: str = \"parquet\",\n", - ") -> None:\n", - " \"\"\"SQL Ingest - Ingest data using SQL query\n", - "\n", - " :param context: the function context\n", - " :param sql_query: the sql query used to retrieve the data\n", - " :param database_url: database connection URL\n", - " :param file_ext: (\"parquet\") format for result file\n", - "\n", - "\"\"\"\n", - "\n", - " engine = create_engine(database_url)\n", - " df = pd.read_sql(sql_query, engine)\n", - "\n", - " context.log_dataset('query result',\n", - " df=df,\n", - " format=file_ext,\n", - " artifact_path=context.artifact_subpath('data'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'HOME'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 
2\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34m'http://mlrun-api:8080'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34mf'{os.environ[\"HOME\"]}/artifacts'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;32mC:\\Program Files\\Python37\\lib\\os.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 679\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 680\u001B[0m \u001B[1;31m# raise KeyError with the original key value\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 681\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;32mfrom\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 682\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdecodevalue\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 683\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'HOME'" - ] - } - ], - "source": [ - "from mlrun import 
mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save function" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def mount_secret(\n", - " secret_name, volume_mount_path, volume_name='secret', items=None\n", - "):\n", - " def _mount_secret(task):\n", - " from kubernetes import client as k8s_client\n", - " vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)\n", - " return task.add_volume(\n", - " k8s_client.V1Volume(name=volume_name, secret=vol)\n", - " ).add_volume_mount(\n", - " k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)\n", - " )\n", - " return _mount_secret" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, NewTask\n", - "import os\n", - "\n", - "fn = code_to_function(name=\"sql_to_file\",\n", - " handler=\"sql_to_file\",\n", - " description=\"SQL To File - Ingest data using SQL query\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"adih\"})\n", - "\n", - "if \"V3IO_ACCESS_KEY\" in list(os.environ):\n", - " fn.apply(mount_secret(secret_name='presto-tls',\n", - " volume_mount_path= '/var/run/iguazio/secrets/'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build the image" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest\n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name 
mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. \n", - "\u001B[36mINFO\u001B[0m[0027] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0039] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "\u001B[36mINFO\u001B[0m[0046] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] \n", - "Collecting git+https://github.com/v3io/PyHive.git@v0.6.999\n", - " Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw\n", - " Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw\n", - "Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: PyHive\n", - " Building wheel for PyHive (setup.py): started\n", - " Building wheel for PyHive (setup.py): finished with status 'done'\n", - " Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 
sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d\n", - "Successfully built PyHive\n", - "Installing collected packages: PyHive\n", - "Successfully installed PyHive-0.6.1.dev0\n", - "\u001B[36mINFO\u001B[0m[0048] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0048] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0053] RUN pip install sqlalchemy==1.3.11 \n", - "\u001B[36mINFO\u001B[0m[0053] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0053] args: [-c pip install sqlalchemy==1.3.11] \n", - "Collecting sqlalchemy==1.3.11\n", - " Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)\n", - "Building wheels for collected packages: sqlalchemy\n", - " Building wheel for sqlalchemy (setup.py): started\n", - " Building wheel for sqlalchemy (setup.py): finished with status 'done'\n", - " Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c\n", - " Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16\n", - "Successfully built sqlalchemy\n", - "Installing collected packages: sqlalchemy\n", - " Attempting uninstall: sqlalchemy\n", - " Found existing installation: SQLAlchemy 1.3.17\n", - " Uninstalling SQLAlchemy-1.3.17:\n", - " Successfully uninstalled SQLAlchemy-1.3.17\n", - "Successfully installed sqlalchemy-1.3.11\n", - "\u001B[36mINFO\u001B[0m[0057] Taking snapshot of full filesystem... 
\n", - "\u001B[36mINFO\u001B[0m[0057] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0063] RUN pip install PyMySQL==0.9.3 \n", - "\u001B[36mINFO\u001B[0m[0063] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0063] args: [-c pip install PyMySQL==0.9.3] \n", - "Collecting PyMySQL==0.9.3\n", - " Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)\n", - "Installing collected packages: PyMySQL\n", - "Successfully installed PyMySQL-0.9.3\n", - "\u001B[36mINFO\u001B[0m[0064] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0064] Resolving paths \n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": "" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading from a public MySQL DB" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'\n", - "mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "sql_task = NewTask(name='sql',\n", - " handler=sql_to_file,\n", - " params={'sql_query': mysql_query,\n", - " 'database_url': mysql_url})\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default\n", - "[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run it on a cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz\n", - "[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default\n", - "[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SQL query from Iguazio Key Value via Presto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to create a table and set the sql_table path accordingly.
\n", - "you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import os\n", - "sql_table = os.path.join('v3io.users.\"'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab\"')\n", - "sql_query_string = 'select * from '+sql_table+\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
" - ], - "text/plain": [ - "[('UBS I.ETF-DL G.SEL.DIV.AD', 'ETF', '08:27', 'IE00BMP3HG27', 8.418, '2018-03-26 00:00:00.000', 8.418, 1, 'UBUM', 'EUR', 2505450, 8.418, 403, 8.418),\n", - " ('GILEAD SCIENCES DL-,001', 'Common stock', '08:00', 'US3755581036', 59.7, '2018-03-26 00:00:00.000', 59.84, 3, 'GIS', 'EUR', 2506495, 59.84, 745, 59.7),\n", - " ('3M CO. DL-,01', 'Common stock', '08:00', 'US88579Y1010', 176.51, '2018-03-26 00:00:00.000', 176.51, 1, 'MMM', 'EUR', 2506577, 176.51, 39, 176.51),\n", - " ('DIEBOLD NIXDORF INH.O.N.', 'Common stock', '08:06', 'DE000A0CAYB2', 66.3, '2018-03-26 00:00:00.000', 66.3, 1, 'WIN', 'EUR', 2504286, 66.3, 60, 66.3),\n", - " ('XTR.II EUR.INF.LINK.BD 1C', 'ETF', '08:13', 'LU0290358224', 218.97, '2018-03-26 00:00:00.000', 218.97, 1, 'DBXK', 'EUR', 2505840, 218.97, 110, 218.97),\n", - " ('UBS-ETF-MSCI EMU S.C.EOAD', 'ETF', '08:33', 'LU0671493277', 100.2, '2018-03-26 00:00:00.000', 100.2, 1, 'UEFD', 'EUR', 2506045, 100.2, 180, 100.2),\n", - " ('ASMALLWORLD AG SF 1', 'Common stock', '08:23', 'CH0404880129', 12.7, '2018-03-26 00:00:00.000', 12.7, 1, '1Q7', 'EUR', 3089122, 12.7, 400, 12.7),\n", - " ('IS.DJ GLOB.TITAN.50 U.ETF', 'ETF', '08:42', 'DE0006289382', 31.25, '2018-03-26 00:00:00.000', 31.25, 1, 'EXI2', 'EUR', 2505029, 31.25, 50, 31.25),\n", - " ('ISHS IV-AGEING POPUL.ETF', 'ETF', '08:17', 'IE00BYZK4669', 4.926, '2018-03-26 00:00:00.000', 4.926, 1, '2B77', 'EUR', 2505552, 4.926, 25, 4.926),\n", - " ('PORSCHE AUTOM.HLDG VZO', 'Common stock', '08:00', 'DE000PAH0038', 64.68, '2018-03-26 00:00:00.000', 64.76, 8, 'PAH3', 'EUR', 2504816, 64.76, 698, 64.7)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select * from $sql_table limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sql_task = NewTask(name='sql', \n", - " handler=sql_to_file,\n", - " params={'sql_query': sql_query_string,\n", - " 
'database_url': os.getenv('DATABASE_URL')}\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default\n", - "[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f\n", - "[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default\n", - "[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/src/sql_to_file.py b/functions/development/sql_to_file/0.8.0/src/sql_to_file.py deleted file mode 100644 index 086cb066..00000000 --- a/functions/development/sql_to_file/0.8.0/src/sql_to_file.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pyhive -from sqlalchemy.engine import create_engine -from mlrun.execution import MLClientCtx - - -def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - 
engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - ) diff --git a/functions/development/sql_to_file/0.8.0/src/test_sql_to_file.py b/functions/development/sql_to_file/0.8.0/src/test_sql_to_file.py deleted file mode 100644 index cc345a27..00000000 --- a/functions/development/sql_to_file/0.8.0/src/test_sql_to_file.py +++ /dev/null @@ -1,18 +0,0 @@ -from mlrun import code_to_function - -mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam' -mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family' - - -def test_run_sql_to_file(): - fn = code_to_function(name='test_sql_to_file', - filename="sql_to_file.py", - handler="sql_to_file", - kind="job", - ) - fn.run(params={'sql_query': mysql_query, - 'database_url': mysql_url} - , local=True - - ) - diff --git a/functions/development/sql_to_file/0.8.0/static/documentation.html b/functions/development/sql_to_file/0.8.0/static/documentation.html deleted file mode 100644 index 6d477881..00000000 --- a/functions/development/sql_to_file/0.8.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -sql_to_file package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

sql_to_file package

-
-

Submodules

-
-
-

sql_to_file.sql_to_file module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/static/example.html b/functions/development/sql_to_file/0.8.0/static/example.html deleted file mode 100644 index 09e89cb7..00000000 --- a/functions/development/sql_to_file/0.8.0/static/example.html +++ /dev/null @@ -1,1472 +0,0 @@ - - - - - - - -SQL Ingest - Ingest data using SQL query - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

SQL Ingest - Ingest data using SQL query

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
---------------------------------------------------------------------------
-ModuleNotFoundError                       Traceback (most recent call last)
-<ipython-input-1-0a63a0760af8> in <module>
-      1 # nuclio: ignore
-----> 2 import nuclio
-      3 
-
-ModuleNotFoundError: No module named 'nuclio'
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-pip install sqlalchemy==1.3.11
-pip install PyMySQL==0.9.3
-
-
-
-
-
-
-
import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-
-"""
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset('query result',
-                        df=df,
-                        format=file_ext,
-                        artifact_path=context.artifact_subpath('data'))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
---------------------------------------------------------------------------
-KeyError                                  Traceback (most recent call last)
-<ipython-input-7-c3828dc245f1> in <module>
-      2 import os
-      3 mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-----> 4 mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-      5 
-      6 
-
-C:\Program Files\Python37\lib\os.py in __getitem__(self, key)
-    679         except KeyError:
-    680             # raise KeyError with the original key value
---> 681             raise KeyError(key) from None
-    682         return self.decodevalue(value)
-    683 
-
-KeyError: 'HOME'
-
-
-
-
-
-
-

Save function

-
-
-
def mount_secret(
-    secret_name, volume_mount_path, volume_name='secret', items=None
-):
-    def _mount_secret(task):
-        from kubernetes import client as k8s_client
-        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
-        return task.add_volume(
-            k8s_client.V1Volume(name=volume_name, secret=vol)
-        ).add_volume_mount(
-            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
-        )
-    return _mount_secret
-
-
-
-
-
-
-
from mlrun import code_to_function, NewTask
-import os
-
-fn = code_to_function(name="sql_to_file",
-                      handler="sql_to_file",
-                      description="SQL To File - Ingest data using SQL query",
-                      categories=["data-prep"],
-                      labels={"author": "adih"})
-
-if "V3IO_ACCESS_KEY" in list(os.environ):
-    fn.apply(mount_secret(secret_name='presto-tls',
-                        volume_mount_path= '/var/run/iguazio/secrets/'))
-
-
-
-
-
-
-

Build the image

-
-
-
fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
-INFO[0027] Taking snapshot of full filesystem...        
-INFO[0039] Resolving paths                              
-INFO[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-INFO[0046] cmd: /bin/sh                                 
-INFO[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] 
-Collecting git+https://github.com/v3io/PyHive.git@v0.6.999
-  Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw
-  Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw
-Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)
-Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)
-Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)
-Building wheels for collected packages: PyHive
-  Building wheel for PyHive (setup.py): started
-  Building wheel for PyHive (setup.py): finished with status 'done'
-  Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166
-  Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d
-Successfully built PyHive
-Installing collected packages: PyHive
-Successfully installed PyHive-0.6.1.dev0
-INFO[0048] Taking snapshot of full filesystem...        
-INFO[0048] Resolving paths                              
-INFO[0053] RUN pip install sqlalchemy==1.3.11           
-INFO[0053] cmd: /bin/sh                                 
-INFO[0053] args: [-c pip install sqlalchemy==1.3.11]    
-Collecting sqlalchemy==1.3.11
-  Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)
-Building wheels for collected packages: sqlalchemy
-  Building wheel for sqlalchemy (setup.py): started
-  Building wheel for sqlalchemy (setup.py): finished with status 'done'
-  Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c
-  Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16
-Successfully built sqlalchemy
-Installing collected packages: sqlalchemy
-  Attempting uninstall: sqlalchemy
-    Found existing installation: SQLAlchemy 1.3.17
-    Uninstalling SQLAlchemy-1.3.17:
-      Successfully uninstalled SQLAlchemy-1.3.17
-Successfully installed sqlalchemy-1.3.11
-INFO[0057] Taking snapshot of full filesystem...        
-INFO[0057] Resolving paths                              
-INFO[0063] RUN pip install PyMySQL==0.9.3               
-INFO[0063] cmd: /bin/sh                                 
-INFO[0063] args: [-c pip install PyMySQL==0.9.3]        
-Collecting PyMySQL==0.9.3
-  Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)
-Installing collected packages: PyMySQL
-Successfully installed PyMySQL-0.9.3
-INFO[0064] Taking snapshot of full filesystem...        
-INFO[0064] Resolving paths                              
-
-
-
True
-
-
-
-
-
-
-
fn.export('function.yaml')
-
-
-
-
-
[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>
-
-
-
-
-
-
-

Test

-
-

Reading from a public MySQL DB

-
-
-
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
-mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-sql_task = NewTask(name='sql',
-                   handler=sql_to_file,
-                   params={'sql_query': mysql_query,
-                           'database_url': mysql_url})
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
-[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed
-
-
-
-
-
-

Run it on a cluster

-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
-[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
-[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87fba74b00>
-
-
-
-
-
-
-
-

SQL query from Iguazio Key Value via Presto

-

You need to create a table and set the sql_table path accordingly.
-you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

-
-
-
# nuclio: ignore
-import os
-sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
-sql_query_string = 'select * from '+sql_table+""
-
-
-
-
-
-
-
%sql select * from $sql_table limit 10
-
-
-
-
-
Done.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
-
-
-
-
sql_task = NewTask(name='sql', 
-                   handler=sql_to_file,
-                   params={'sql_query': sql_query_string,
-                          'database_url': os.getenv('DATABASE_URL')}
-                          )
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
-[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed
-
-
-
-
-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
-[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
-[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87f8e26c18>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/static/function.html b/functions/development/sql_to_file/0.8.0/static/function.html deleted file mode 100644 index cff68646..00000000 --- a/functions/development/sql_to_file/0.8.0/static/function.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: sql-to-file
-  tag: ''
-  hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863
-  project: default
-  labels:
-    author: adih
-  categories:
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: sql_to_file
-  entry_points:
-    sql_to_file:
-      name: sql_to_file
-      doc: SQL Ingest - Ingest data using SQL query
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: sql_query
-        type: str
-        doc: the sql query used to retrieve the data
-        default: ''
-      - name: database_url
-        type: str
-        doc: database connection URL
-        default: ''
-      - name: file_ext
-        type: str
-        doc: ("parquet") format for result file
-        default: parquet
-      outputs:
-      - default: ''
-      lineno: 9
-  description: SQL To File - Ingest data using SQL query
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/static/item.html b/functions/development/sql_to_file/0.8.0/static/item.html deleted file mode 100644 index 56af2500..00000000 --- a/functions/development/sql_to_file/0.8.0/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: SQL To File - Ingest data using SQL query
-doc: ''
-example: sql_to_file.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: adih
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: sql-to-file
-platformVersion: 3.2.0
-spec:
-  filename: sql_to_file.py
-  handler: sql_to_file
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.8.0/static/source.html b/functions/development/sql_to_file/0.8.0/static/source.html deleted file mode 100644 index 7d22754f..00000000 --- a/functions/development/sql_to_file/0.8.0/static/source.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-    """
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset(
-        "query result",
-        df=df,
-        format=file_ext,
-        artifact_path=context.artifact_subpath("data"),
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/src/function.yaml b/functions/development/sql_to_file/0.9.0/src/function.yaml deleted file mode 100644 index 10b332a5..00000000 --- a/functions/development/sql_to_file/0.9.0/src/function.yaml +++ /dev/null @@ -1,47 +0,0 @@ -kind: job -metadata: - name: sql-to-file - tag: '' - hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863 - project: '' - labels: - author: adih - categories: - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: sql_to_file - entry_points: - sql_to_file: - name: sql_to_file - doc: SQL Ingest - Ingest data using SQL query - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: sql_query - type: str - doc: the sql query used to retrieve the data - default: '' - - name: database_url - type: str - doc: database connection URL - default: '' - - name: file_ext - type: str - doc: ("parquet") format for result file - default: parquet - outputs: - - default: '' - lineno: 9 - description: SQL To File - Ingest data using SQL query - build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py - affinity: null -verbose: false diff --git a/functions/development/sql_to_file/0.9.0/src/item.yaml b/functions/development/sql_to_file/0.9.0/src/item.yaml deleted file mode 100644 index df066abc..00000000 --- a/functions/development/sql_to_file/0.9.0/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: SQL To File - Ingest data using SQL query -doc: '' -example: sql_to_file.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: adih -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: sql-to-file -platformVersion: 3.2.0 -spec: - filename: sql_to_file.py - handler: sql_to_file - image: mlrun/mlrun - kind: job - requirements: [] -url: '' 
-version: 0.9.0 diff --git a/functions/development/sql_to_file/0.9.0/src/requirements.txt b/functions/development/sql_to_file/0.9.0/src/requirements.txt deleted file mode 100644 index 21ef3f07..00000000 --- a/functions/development/sql_to_file/0.9.0/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mlrun -PyHive -pymysql \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/src/sql_to_file.ipynb b/functions/development/sql_to_file/0.9.0/src/sql_to_file.ipynb deleted file mode 100644 index d4a084ad..00000000 --- a/functions/development/sql_to_file/0.9.0/src/sql_to_file.ipynb +++ /dev/null @@ -1,1567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SQL Ingest - Ingest data using SQL query " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'nuclio'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[1;31m# nuclio: ignore\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 2\u001B[1;33m \u001B[1;32mimport\u001B[0m \u001B[0mnuclio\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 3\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'nuclio'" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config 
kind = \"job\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "pip install sqlalchemy==1.3.11\n", - "pip install PyMySQL==0.9.3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyhive\n", - "from sqlalchemy.engine import create_engine\n", - "from mlrun.execution import MLClientCtx\n", - "\n", - "\n", - "def sql_to_file(\n", - " context: MLClientCtx,\n", - " sql_query: str,\n", - " database_url: str,\n", - " file_ext: str = \"parquet\",\n", - ") -> None:\n", - " \"\"\"SQL Ingest - Ingest data using SQL query\n", - "\n", - " :param context: the function context\n", - " :param sql_query: the sql query used to retrieve the data\n", - " :param database_url: database connection URL\n", - " :param file_ext: (\"parquet\") format for result file\n", - "\n", - "\"\"\"\n", - "\n", - " engine = create_engine(database_url)\n", - " df = pd.read_sql(sql_query, engine)\n", - "\n", - " context.log_dataset('query result',\n", - " df=df,\n", - " format=file_ext,\n", - " artifact_path=context.artifact_subpath('data'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'HOME'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 
2\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34m'http://mlrun-api:8080'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34mf'{os.environ[\"HOME\"]}/artifacts'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;32mC:\\Program Files\\Python37\\lib\\os.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 679\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 680\u001B[0m \u001B[1;31m# raise KeyError with the original key value\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 681\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;32mfrom\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 682\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdecodevalue\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 683\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'HOME'" - ] - } - ], - "source": [ - "from mlrun import 
mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save function" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def mount_secret(\n", - " secret_name, volume_mount_path, volume_name='secret', items=None\n", - "):\n", - " def _mount_secret(task):\n", - " from kubernetes import client as k8s_client\n", - " vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)\n", - " return task.add_volume(\n", - " k8s_client.V1Volume(name=volume_name, secret=vol)\n", - " ).add_volume_mount(\n", - " k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)\n", - " )\n", - " return _mount_secret" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, NewTask\n", - "import os\n", - "\n", - "fn = code_to_function(name=\"sql_to_file\",\n", - " handler=\"sql_to_file\",\n", - " description=\"SQL To File - Ingest data using SQL query\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"adih\"})\n", - "\n", - "if \"V3IO_ACCESS_KEY\" in list(os.environ):\n", - " fn.apply(mount_secret(secret_name='presto-tls',\n", - " volume_mount_path= '/var/run/iguazio/secrets/'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build the image" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest\n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name 
mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. \n", - "\u001B[36mINFO\u001B[0m[0027] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0039] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "\u001B[36mINFO\u001B[0m[0046] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] \n", - "Collecting git+https://github.com/v3io/PyHive.git@v0.6.999\n", - " Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw\n", - " Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw\n", - "Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: PyHive\n", - " Building wheel for PyHive (setup.py): started\n", - " Building wheel for PyHive (setup.py): finished with status 'done'\n", - " Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 
sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d\n", - "Successfully built PyHive\n", - "Installing collected packages: PyHive\n", - "Successfully installed PyHive-0.6.1.dev0\n", - "\u001B[36mINFO\u001B[0m[0048] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0048] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0053] RUN pip install sqlalchemy==1.3.11 \n", - "\u001B[36mINFO\u001B[0m[0053] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0053] args: [-c pip install sqlalchemy==1.3.11] \n", - "Collecting sqlalchemy==1.3.11\n", - " Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)\n", - "Building wheels for collected packages: sqlalchemy\n", - " Building wheel for sqlalchemy (setup.py): started\n", - " Building wheel for sqlalchemy (setup.py): finished with status 'done'\n", - " Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c\n", - " Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16\n", - "Successfully built sqlalchemy\n", - "Installing collected packages: sqlalchemy\n", - " Attempting uninstall: sqlalchemy\n", - " Found existing installation: SQLAlchemy 1.3.17\n", - " Uninstalling SQLAlchemy-1.3.17:\n", - " Successfully uninstalled SQLAlchemy-1.3.17\n", - "Successfully installed sqlalchemy-1.3.11\n", - "\u001B[36mINFO\u001B[0m[0057] Taking snapshot of full filesystem... 
\n", - "\u001B[36mINFO\u001B[0m[0057] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0063] RUN pip install PyMySQL==0.9.3 \n", - "\u001B[36mINFO\u001B[0m[0063] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0063] args: [-c pip install PyMySQL==0.9.3] \n", - "Collecting PyMySQL==0.9.3\n", - " Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)\n", - "Installing collected packages: PyMySQL\n", - "Successfully installed PyMySQL-0.9.3\n", - "\u001B[36mINFO\u001B[0m[0064] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0064] Resolving paths \n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": "" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading from a public MySQL DB" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'\n", - "mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "sql_task = NewTask(name='sql',\n", - " handler=sql_to_file,\n", - " params={'sql_query': mysql_query,\n", - " 'database_url': mysql_url})\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default\n", - "[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run it on a cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz\n", - "[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default\n", - "[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SQL query from Iguazio Key Value via Presto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to create a table and set the sql_table path accordingly.
\n", - "you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import os\n", - "sql_table = os.path.join('v3io.users.\"'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab\"')\n", - "sql_query_string = 'select * from '+sql_table+\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
" - ], - "text/plain": [ - "[('UBS I.ETF-DL G.SEL.DIV.AD', 'ETF', '08:27', 'IE00BMP3HG27', 8.418, '2018-03-26 00:00:00.000', 8.418, 1, 'UBUM', 'EUR', 2505450, 8.418, 403, 8.418),\n", - " ('GILEAD SCIENCES DL-,001', 'Common stock', '08:00', 'US3755581036', 59.7, '2018-03-26 00:00:00.000', 59.84, 3, 'GIS', 'EUR', 2506495, 59.84, 745, 59.7),\n", - " ('3M CO. DL-,01', 'Common stock', '08:00', 'US88579Y1010', 176.51, '2018-03-26 00:00:00.000', 176.51, 1, 'MMM', 'EUR', 2506577, 176.51, 39, 176.51),\n", - " ('DIEBOLD NIXDORF INH.O.N.', 'Common stock', '08:06', 'DE000A0CAYB2', 66.3, '2018-03-26 00:00:00.000', 66.3, 1, 'WIN', 'EUR', 2504286, 66.3, 60, 66.3),\n", - " ('XTR.II EUR.INF.LINK.BD 1C', 'ETF', '08:13', 'LU0290358224', 218.97, '2018-03-26 00:00:00.000', 218.97, 1, 'DBXK', 'EUR', 2505840, 218.97, 110, 218.97),\n", - " ('UBS-ETF-MSCI EMU S.C.EOAD', 'ETF', '08:33', 'LU0671493277', 100.2, '2018-03-26 00:00:00.000', 100.2, 1, 'UEFD', 'EUR', 2506045, 100.2, 180, 100.2),\n", - " ('ASMALLWORLD AG SF 1', 'Common stock', '08:23', 'CH0404880129', 12.7, '2018-03-26 00:00:00.000', 12.7, 1, '1Q7', 'EUR', 3089122, 12.7, 400, 12.7),\n", - " ('IS.DJ GLOB.TITAN.50 U.ETF', 'ETF', '08:42', 'DE0006289382', 31.25, '2018-03-26 00:00:00.000', 31.25, 1, 'EXI2', 'EUR', 2505029, 31.25, 50, 31.25),\n", - " ('ISHS IV-AGEING POPUL.ETF', 'ETF', '08:17', 'IE00BYZK4669', 4.926, '2018-03-26 00:00:00.000', 4.926, 1, '2B77', 'EUR', 2505552, 4.926, 25, 4.926),\n", - " ('PORSCHE AUTOM.HLDG VZO', 'Common stock', '08:00', 'DE000PAH0038', 64.68, '2018-03-26 00:00:00.000', 64.76, 8, 'PAH3', 'EUR', 2504816, 64.76, 698, 64.7)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select * from $sql_table limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sql_task = NewTask(name='sql', \n", - " handler=sql_to_file,\n", - " params={'sql_query': sql_query_string,\n", - " 
'database_url': os.getenv('DATABASE_URL')}\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default\n", - "[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f\n", - "[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default\n", - "[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/src/sql_to_file.py b/functions/development/sql_to_file/0.9.0/src/sql_to_file.py deleted file mode 100644 index 086cb066..00000000 --- a/functions/development/sql_to_file/0.9.0/src/sql_to_file.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pyhive -from sqlalchemy.engine import create_engine -from mlrun.execution import MLClientCtx - - -def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - 
engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - ) diff --git a/functions/development/sql_to_file/0.9.0/src/test_sql_to_file.py b/functions/development/sql_to_file/0.9.0/src/test_sql_to_file.py deleted file mode 100644 index cc345a27..00000000 --- a/functions/development/sql_to_file/0.9.0/src/test_sql_to_file.py +++ /dev/null @@ -1,18 +0,0 @@ -from mlrun import code_to_function - -mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam' -mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family' - - -def test_run_sql_to_file(): - fn = code_to_function(name='test_sql_to_file', - filename="sql_to_file.py", - handler="sql_to_file", - kind="job", - ) - fn.run(params={'sql_query': mysql_query, - 'database_url': mysql_url} - , local=True - - ) - diff --git a/functions/development/sql_to_file/0.9.0/static/documentation.html b/functions/development/sql_to_file/0.9.0/static/documentation.html deleted file mode 100644 index 6d477881..00000000 --- a/functions/development/sql_to_file/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -sql_to_file package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

sql_to_file package

-
-

Submodules

-
-
-

sql_to_file.sql_to_file module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/static/example.html b/functions/development/sql_to_file/0.9.0/static/example.html deleted file mode 100644 index 09e89cb7..00000000 --- a/functions/development/sql_to_file/0.9.0/static/example.html +++ /dev/null @@ -1,1472 +0,0 @@ - - - - - - - -SQL Ingest - Ingest data using SQL query - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

SQL Ingest - Ingest data using SQL query

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
---------------------------------------------------------------------------
-ModuleNotFoundError                       Traceback (most recent call last)
-<ipython-input-1-0a63a0760af8> in <module>
-      1 # nuclio: ignore
-----> 2 import nuclio
-      3 
-
-ModuleNotFoundError: No module named 'nuclio'
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-pip install sqlalchemy==1.3.11
-pip install PyMySQL==0.9.3
-
-
-
-
-
-
-
import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-
-"""
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset('query result',
-                        df=df,
-                        format=file_ext,
-                        artifact_path=context.artifact_subpath('data'))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
---------------------------------------------------------------------------
-KeyError                                  Traceback (most recent call last)
-<ipython-input-7-c3828dc245f1> in <module>
-      2 import os
-      3 mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-----> 4 mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-      5 
-      6 
-
-C:\Program Files\Python37\lib\os.py in __getitem__(self, key)
-    679         except KeyError:
-    680             # raise KeyError with the original key value
---> 681             raise KeyError(key) from None
-    682         return self.decodevalue(value)
-    683 
-
-KeyError: 'HOME'
-
-
-
-
-
-
-

Save function

-
-
-
def mount_secret(
-    secret_name, volume_mount_path, volume_name='secret', items=None
-):
-    def _mount_secret(task):
-        from kubernetes import client as k8s_client
-        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
-        return task.add_volume(
-            k8s_client.V1Volume(name=volume_name, secret=vol)
-        ).add_volume_mount(
-            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
-        )
-    return _mount_secret
-
-
-
-
-
-
-
from mlrun import code_to_function, NewTask
-import os
-
-fn = code_to_function(name="sql_to_file",
-                      handler="sql_to_file",
-                      description="SQL To File - Ingest data using SQL query",
-                      categories=["data-prep"],
-                      labels={"author": "adih"})
-
-if "V3IO_ACCESS_KEY" in list(os.environ):
-    fn.apply(mount_secret(secret_name='presto-tls',
-                        volume_mount_path= '/var/run/iguazio/secrets/'))
-
-
-
-
-
-
-

Build the image

-
-
-
fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
-INFO[0027] Taking snapshot of full filesystem...        
-INFO[0039] Resolving paths                              
-INFO[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-INFO[0046] cmd: /bin/sh                                 
-INFO[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] 
-Collecting git+https://github.com/v3io/PyHive.git@v0.6.999
-  Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw
-  Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw
-Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)
-Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)
-Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)
-Building wheels for collected packages: PyHive
-  Building wheel for PyHive (setup.py): started
-  Building wheel for PyHive (setup.py): finished with status 'done'
-  Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166
-  Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d
-Successfully built PyHive
-Installing collected packages: PyHive
-Successfully installed PyHive-0.6.1.dev0
-INFO[0048] Taking snapshot of full filesystem...        
-INFO[0048] Resolving paths                              
-INFO[0053] RUN pip install sqlalchemy==1.3.11           
-INFO[0053] cmd: /bin/sh                                 
-INFO[0053] args: [-c pip install sqlalchemy==1.3.11]    
-Collecting sqlalchemy==1.3.11
-  Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)
-Building wheels for collected packages: sqlalchemy
-  Building wheel for sqlalchemy (setup.py): started
-  Building wheel for sqlalchemy (setup.py): finished with status 'done'
-  Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c
-  Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16
-Successfully built sqlalchemy
-Installing collected packages: sqlalchemy
-  Attempting uninstall: sqlalchemy
-    Found existing installation: SQLAlchemy 1.3.17
-    Uninstalling SQLAlchemy-1.3.17:
-      Successfully uninstalled SQLAlchemy-1.3.17
-Successfully installed sqlalchemy-1.3.11
-INFO[0057] Taking snapshot of full filesystem...        
-INFO[0057] Resolving paths                              
-INFO[0063] RUN pip install PyMySQL==0.9.3               
-INFO[0063] cmd: /bin/sh                                 
-INFO[0063] args: [-c pip install PyMySQL==0.9.3]        
-Collecting PyMySQL==0.9.3
-  Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)
-Installing collected packages: PyMySQL
-Successfully installed PyMySQL-0.9.3
-INFO[0064] Taking snapshot of full filesystem...        
-INFO[0064] Resolving paths                              
-
-
-
True
-
-
-
-
-
-
-
fn.export('function.yaml')
-
-
-
-
-
[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>
-
-
-
-
-
-
-

Test

-
-

Reading from a public MySQL DB

-
-
-
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
-mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-sql_task = NewTask(name='sql',
-                   handler=sql_to_file,
-                   params={'sql_query': mysql_query,
-                           'database_url': mysql_url})
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
-[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed
-
-
-
-
-
-

Run it on a cluster

-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
-[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
-[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87fba74b00>
-
-
-
-
-
-
-
-

SQL query from Iguazio Key Value via Presto

-

You need to create a table and set the sql_table path accordingly.
-you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

-
-
-
# nuclio: ignore
-import os
-sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
-sql_query_string = 'select * from '+sql_table+""
-
-
-
-
-
-
-
%sql select * from $sql_table limit 10
-
-
-
-
-
Done.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
-
-
-
-
sql_task = NewTask(name='sql', 
-                   handler=sql_to_file,
-                   params={'sql_query': sql_query_string,
-                          'database_url': os.getenv('DATABASE_URL')}
-                          )
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
-[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed
-
-
-
-
-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
-[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
-[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87f8e26c18>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/static/function.html b/functions/development/sql_to_file/0.9.0/static/function.html deleted file mode 100644 index 4de597e5..00000000 --- a/functions/development/sql_to_file/0.9.0/static/function.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: sql-to-file
-  tag: ''
-  hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863
-  project: ''
-  labels:
-    author: adih
-  categories:
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: sql_to_file
-  entry_points:
-    sql_to_file:
-      name: sql_to_file
-      doc: SQL Ingest - Ingest data using SQL query
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: sql_query
-        type: str
-        doc: the sql query used to retrieve the data
-        default: ''
-      - name: database_url
-        type: str
-        doc: database connection URL
-        default: ''
-      - name: file_ext
-        type: str
-        doc: ("parquet") format for result file
-        default: parquet
-      outputs:
-      - default: ''
-      lineno: 9
-  description: SQL To File - Ingest data using SQL query
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/static/item.html b/functions/development/sql_to_file/0.9.0/static/item.html deleted file mode 100644 index 22c29059..00000000 --- a/functions/development/sql_to_file/0.9.0/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: SQL To File - Ingest data using SQL query
-doc: ''
-example: sql_to_file.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: adih
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: sql-to-file
-platformVersion: 3.2.0
-spec:
-  filename: sql_to_file.py
-  handler: sql_to_file
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.0/static/source.html b/functions/development/sql_to_file/0.9.0/static/source.html deleted file mode 100644 index 7d22754f..00000000 --- a/functions/development/sql_to_file/0.9.0/static/source.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-    """
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset(
-        "query result",
-        df=df,
-        format=file_ext,
-        artifact_path=context.artifact_subpath("data"),
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/src/function.yaml b/functions/development/sql_to_file/0.9.1/src/function.yaml deleted file mode 100644 index 10b332a5..00000000 --- a/functions/development/sql_to_file/0.9.1/src/function.yaml +++ /dev/null @@ -1,47 +0,0 @@ -kind: job -metadata: - name: sql-to-file - tag: '' - hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863 - project: '' - labels: - author: adih - categories: - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: sql_to_file - entry_points: - sql_to_file: - name: sql_to_file - doc: SQL Ingest - Ingest data using SQL query - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: sql_query - type: str - doc: the sql query used to retrieve the data - default: '' - - name: database_url - type: str - doc: database connection URL - default: '' - - name: file_ext - type: str - doc: ("parquet") format for result file - default: parquet - outputs: - - default: '' - lineno: 9 - description: SQL To File - Ingest data using SQL query - build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py - affinity: null -verbose: false diff --git a/functions/development/sql_to_file/0.9.1/src/item.yaml b/functions/development/sql_to_file/0.9.1/src/item.yaml deleted file mode 100644 index b8f354c3..00000000 --- a/functions/development/sql_to_file/0.9.1/src/item.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: SQL To File - Ingest data using SQL query -doc: '' -example: sql_to_file.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: adih -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: sql-to-file -platformVersion: 3.2.0 -spec: - filename: sql_to_file.py - handler: sql_to_file - image: mlrun/mlrun - kind: job - requirements: [] -url: '' 
-version: 0.9.1 diff --git a/functions/development/sql_to_file/0.9.1/src/requirements.txt b/functions/development/sql_to_file/0.9.1/src/requirements.txt deleted file mode 100644 index c6dce205..00000000 --- a/functions/development/sql_to_file/0.9.1/src/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mlrun -pyhive -pymysql \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/src/sql_to_file.ipynb b/functions/development/sql_to_file/0.9.1/src/sql_to_file.ipynb deleted file mode 100644 index d4a084ad..00000000 --- a/functions/development/sql_to_file/0.9.1/src/sql_to_file.ipynb +++ /dev/null @@ -1,1567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SQL Ingest - Ingest data using SQL query " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'nuclio'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[1;31m# nuclio: ignore\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 2\u001B[1;33m \u001B[1;32mimport\u001B[0m \u001B[0mnuclio\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 3\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'nuclio'" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config 
kind = \"job\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "pip install sqlalchemy==1.3.11\n", - "pip install PyMySQL==0.9.3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyhive\n", - "from sqlalchemy.engine import create_engine\n", - "from mlrun.execution import MLClientCtx\n", - "\n", - "\n", - "def sql_to_file(\n", - " context: MLClientCtx,\n", - " sql_query: str,\n", - " database_url: str,\n", - " file_ext: str = \"parquet\",\n", - ") -> None:\n", - " \"\"\"SQL Ingest - Ingest data using SQL query\n", - "\n", - " :param context: the function context\n", - " :param sql_query: the sql query used to retrieve the data\n", - " :param database_url: database connection URL\n", - " :param file_ext: (\"parquet\") format for result file\n", - "\n", - "\"\"\"\n", - "\n", - " engine = create_engine(database_url)\n", - " df = pd.read_sql(sql_query, engine)\n", - "\n", - " context.log_dataset('query result',\n", - " df=df,\n", - " format=file_ext,\n", - " artifact_path=context.artifact_subpath('data'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'HOME'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 
2\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34m'http://mlrun-api:8080'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34mf'{os.environ[\"HOME\"]}/artifacts'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;32mC:\\Program Files\\Python37\\lib\\os.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 679\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 680\u001B[0m \u001B[1;31m# raise KeyError with the original key value\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 681\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;32mfrom\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 682\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdecodevalue\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 683\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'HOME'" - ] - } - ], - "source": [ - "from mlrun import 
mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save function" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def mount_secret(\n", - " secret_name, volume_mount_path, volume_name='secret', items=None\n", - "):\n", - " def _mount_secret(task):\n", - " from kubernetes import client as k8s_client\n", - " vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)\n", - " return task.add_volume(\n", - " k8s_client.V1Volume(name=volume_name, secret=vol)\n", - " ).add_volume_mount(\n", - " k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)\n", - " )\n", - " return _mount_secret" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, NewTask\n", - "import os\n", - "\n", - "fn = code_to_function(name=\"sql_to_file\",\n", - " handler=\"sql_to_file\",\n", - " description=\"SQL To File - Ingest data using SQL query\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"adih\"})\n", - "\n", - "if \"V3IO_ACCESS_KEY\" in list(os.environ):\n", - " fn.apply(mount_secret(secret_name='presto-tls',\n", - " volume_mount_path= '/var/run/iguazio/secrets/'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build the image" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest\n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name 
mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. \n", - "\u001B[36mINFO\u001B[0m[0027] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0039] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "\u001B[36mINFO\u001B[0m[0046] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] \n", - "Collecting git+https://github.com/v3io/PyHive.git@v0.6.999\n", - " Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw\n", - " Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw\n", - "Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: PyHive\n", - " Building wheel for PyHive (setup.py): started\n", - " Building wheel for PyHive (setup.py): finished with status 'done'\n", - " Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 
sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d\n", - "Successfully built PyHive\n", - "Installing collected packages: PyHive\n", - "Successfully installed PyHive-0.6.1.dev0\n", - "\u001B[36mINFO\u001B[0m[0048] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0048] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0053] RUN pip install sqlalchemy==1.3.11 \n", - "\u001B[36mINFO\u001B[0m[0053] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0053] args: [-c pip install sqlalchemy==1.3.11] \n", - "Collecting sqlalchemy==1.3.11\n", - " Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)\n", - "Building wheels for collected packages: sqlalchemy\n", - " Building wheel for sqlalchemy (setup.py): started\n", - " Building wheel for sqlalchemy (setup.py): finished with status 'done'\n", - " Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c\n", - " Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16\n", - "Successfully built sqlalchemy\n", - "Installing collected packages: sqlalchemy\n", - " Attempting uninstall: sqlalchemy\n", - " Found existing installation: SQLAlchemy 1.3.17\n", - " Uninstalling SQLAlchemy-1.3.17:\n", - " Successfully uninstalled SQLAlchemy-1.3.17\n", - "Successfully installed sqlalchemy-1.3.11\n", - "\u001B[36mINFO\u001B[0m[0057] Taking snapshot of full filesystem... 
\n", - "\u001B[36mINFO\u001B[0m[0057] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0063] RUN pip install PyMySQL==0.9.3 \n", - "\u001B[36mINFO\u001B[0m[0063] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0063] args: [-c pip install PyMySQL==0.9.3] \n", - "Collecting PyMySQL==0.9.3\n", - " Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)\n", - "Installing collected packages: PyMySQL\n", - "Successfully installed PyMySQL-0.9.3\n", - "\u001B[36mINFO\u001B[0m[0064] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0064] Resolving paths \n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": "" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading from a public MySQL DB" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'\n", - "mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "sql_task = NewTask(name='sql',\n", - " handler=sql_to_file,\n", - " params={'sql_query': mysql_query,\n", - " 'database_url': mysql_url})\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default\n", - "[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run it on a cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz\n", - "[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default\n", - "[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SQL query from Iguazio Key Value via Presto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to create a table and set the sql_table path accordingly.
\n", - "you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import os\n", - "sql_table = os.path.join('v3io.users.\"'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab\"')\n", - "sql_query_string = 'select * from '+sql_table+\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
" - ], - "text/plain": [ - "[('UBS I.ETF-DL G.SEL.DIV.AD', 'ETF', '08:27', 'IE00BMP3HG27', 8.418, '2018-03-26 00:00:00.000', 8.418, 1, 'UBUM', 'EUR', 2505450, 8.418, 403, 8.418),\n", - " ('GILEAD SCIENCES DL-,001', 'Common stock', '08:00', 'US3755581036', 59.7, '2018-03-26 00:00:00.000', 59.84, 3, 'GIS', 'EUR', 2506495, 59.84, 745, 59.7),\n", - " ('3M CO. DL-,01', 'Common stock', '08:00', 'US88579Y1010', 176.51, '2018-03-26 00:00:00.000', 176.51, 1, 'MMM', 'EUR', 2506577, 176.51, 39, 176.51),\n", - " ('DIEBOLD NIXDORF INH.O.N.', 'Common stock', '08:06', 'DE000A0CAYB2', 66.3, '2018-03-26 00:00:00.000', 66.3, 1, 'WIN', 'EUR', 2504286, 66.3, 60, 66.3),\n", - " ('XTR.II EUR.INF.LINK.BD 1C', 'ETF', '08:13', 'LU0290358224', 218.97, '2018-03-26 00:00:00.000', 218.97, 1, 'DBXK', 'EUR', 2505840, 218.97, 110, 218.97),\n", - " ('UBS-ETF-MSCI EMU S.C.EOAD', 'ETF', '08:33', 'LU0671493277', 100.2, '2018-03-26 00:00:00.000', 100.2, 1, 'UEFD', 'EUR', 2506045, 100.2, 180, 100.2),\n", - " ('ASMALLWORLD AG SF 1', 'Common stock', '08:23', 'CH0404880129', 12.7, '2018-03-26 00:00:00.000', 12.7, 1, '1Q7', 'EUR', 3089122, 12.7, 400, 12.7),\n", - " ('IS.DJ GLOB.TITAN.50 U.ETF', 'ETF', '08:42', 'DE0006289382', 31.25, '2018-03-26 00:00:00.000', 31.25, 1, 'EXI2', 'EUR', 2505029, 31.25, 50, 31.25),\n", - " ('ISHS IV-AGEING POPUL.ETF', 'ETF', '08:17', 'IE00BYZK4669', 4.926, '2018-03-26 00:00:00.000', 4.926, 1, '2B77', 'EUR', 2505552, 4.926, 25, 4.926),\n", - " ('PORSCHE AUTOM.HLDG VZO', 'Common stock', '08:00', 'DE000PAH0038', 64.68, '2018-03-26 00:00:00.000', 64.76, 8, 'PAH3', 'EUR', 2504816, 64.76, 698, 64.7)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select * from $sql_table limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sql_task = NewTask(name='sql', \n", - " handler=sql_to_file,\n", - " params={'sql_query': sql_query_string,\n", - " 
'database_url': os.getenv('DATABASE_URL')}\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default\n", - "[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f\n", - "[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default\n", - "[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/src/sql_to_file.py b/functions/development/sql_to_file/0.9.1/src/sql_to_file.py deleted file mode 100644 index 086cb066..00000000 --- a/functions/development/sql_to_file/0.9.1/src/sql_to_file.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pyhive -from sqlalchemy.engine import create_engine -from mlrun.execution import MLClientCtx - - -def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - 
engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - ) diff --git a/functions/development/sql_to_file/0.9.1/src/test_sql_to_file.py b/functions/development/sql_to_file/0.9.1/src/test_sql_to_file.py deleted file mode 100644 index cc345a27..00000000 --- a/functions/development/sql_to_file/0.9.1/src/test_sql_to_file.py +++ /dev/null @@ -1,18 +0,0 @@ -from mlrun import code_to_function - -mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam' -mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family' - - -def test_run_sql_to_file(): - fn = code_to_function(name='test_sql_to_file', - filename="sql_to_file.py", - handler="sql_to_file", - kind="job", - ) - fn.run(params={'sql_query': mysql_query, - 'database_url': mysql_url} - , local=True - - ) - diff --git a/functions/development/sql_to_file/0.9.1/static/documentation.html b/functions/development/sql_to_file/0.9.1/static/documentation.html deleted file mode 100644 index 3a1c1e8c..00000000 --- a/functions/development/sql_to_file/0.9.1/static/documentation.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - - -sql_to_file package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

sql_to_file package

-
-

Submodules

-
-
-

sql_to_file.sql_to_file module

-
-
-sql_to_file.sql_to_file.sql_to_file(context: mlrun.execution.MLClientCtx, sql_query: str, database_url: str, file_ext: str = 'parquet')None[source]
-

SQL Ingest - Ingest data using SQL query

-
-
Parameters
-
    -
  • context – the function context

  • -
  • sql_query – the sql query used to retrieve the data

  • -
  • database_url – database connection URL

  • -
  • file_ext – (“parquet”) format for result file

  • -
-
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/static/example.html b/functions/development/sql_to_file/0.9.1/static/example.html deleted file mode 100644 index 5e2ea912..00000000 --- a/functions/development/sql_to_file/0.9.1/static/example.html +++ /dev/null @@ -1,1475 +0,0 @@ - - - - - - - -SQL Ingest - Ingest data using SQL query - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

SQL Ingest - Ingest data using SQL query

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
---------------------------------------------------------------------------
-ModuleNotFoundError                       Traceback (most recent call last)
-<ipython-input-1-0a63a0760af8> in <module>
-      1 # nuclio: ignore
-----> 2 import nuclio
-      3 
-
-ModuleNotFoundError: No module named 'nuclio'
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-pip install sqlalchemy==1.3.11
-pip install PyMySQL==0.9.3
-
-
-
-
-
-
-
import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-
-"""
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset('query result',
-                        df=df,
-                        format=file_ext,
-                        artifact_path=context.artifact_subpath('data'))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig

-
-
-
from mlrun import mlconf
-import os
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
---------------------------------------------------------------------------
-KeyError                                  Traceback (most recent call last)
-<ipython-input-7-c3828dc245f1> in <module>
-      2 import os
-      3 mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-----> 4 mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-      5 
-      6 
-
-C:\Program Files\Python37\lib\os.py in __getitem__(self, key)
-    679         except KeyError:
-    680             # raise KeyError with the original key value
---> 681             raise KeyError(key) from None
-    682         return self.decodevalue(value)
-    683 
-
-KeyError: 'HOME'
-
-
-
-
-
-
-

Save function

-
-
-
def mount_secret(
-    secret_name, volume_mount_path, volume_name='secret', items=None
-):
-    def _mount_secret(task):
-        from kubernetes import client as k8s_client
-        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
-        return task.add_volume(
-            k8s_client.V1Volume(name=volume_name, secret=vol)
-        ).add_volume_mount(
-            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
-        )
-    return _mount_secret
-
-
-
-
-
-
-
from mlrun import code_to_function, NewTask
-import os
-
-fn = code_to_function(name="sql_to_file",
-                      handler="sql_to_file",
-                      description="SQL To File - Ingest data using SQL query",
-                      categories=["data-prep"],
-                      labels={"author": "adih"})
-
-if "V3IO_ACCESS_KEY" in list(os.environ):
-    fn.apply(mount_secret(secret_name='presto-tls',
-                        volume_mount_path= '/var/run/iguazio/secrets/'))
-
-
-
-
-
-
-

Build the image

-
-
-
fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
-INFO[0027] Taking snapshot of full filesystem...        
-INFO[0039] Resolving paths                              
-INFO[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-INFO[0046] cmd: /bin/sh                                 
-INFO[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] 
-Collecting git+https://github.com/v3io/PyHive.git@v0.6.999
-  Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw
-  Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw
-Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)
-Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)
-Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)
-Building wheels for collected packages: PyHive
-  Building wheel for PyHive (setup.py): started
-  Building wheel for PyHive (setup.py): finished with status 'done'
-  Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166
-  Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d
-Successfully built PyHive
-Installing collected packages: PyHive
-Successfully installed PyHive-0.6.1.dev0
-INFO[0048] Taking snapshot of full filesystem...        
-INFO[0048] Resolving paths                              
-INFO[0053] RUN pip install sqlalchemy==1.3.11           
-INFO[0053] cmd: /bin/sh                                 
-INFO[0053] args: [-c pip install sqlalchemy==1.3.11]    
-Collecting sqlalchemy==1.3.11
-  Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)
-Building wheels for collected packages: sqlalchemy
-  Building wheel for sqlalchemy (setup.py): started
-  Building wheel for sqlalchemy (setup.py): finished with status 'done'
-  Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c
-  Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16
-Successfully built sqlalchemy
-Installing collected packages: sqlalchemy
-  Attempting uninstall: sqlalchemy
-    Found existing installation: SQLAlchemy 1.3.17
-    Uninstalling SQLAlchemy-1.3.17:
-      Successfully uninstalled SQLAlchemy-1.3.17
-Successfully installed sqlalchemy-1.3.11
-INFO[0057] Taking snapshot of full filesystem...        
-INFO[0057] Resolving paths                              
-INFO[0063] RUN pip install PyMySQL==0.9.3               
-INFO[0063] cmd: /bin/sh                                 
-INFO[0063] args: [-c pip install PyMySQL==0.9.3]        
-Collecting PyMySQL==0.9.3
-  Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)
-Installing collected packages: PyMySQL
-Successfully installed PyMySQL-0.9.3
-INFO[0064] Taking snapshot of full filesystem...        
-INFO[0064] Resolving paths                              
-
-
-
True
-
-
-
-
-
-
-
fn.export('function.yaml')
-
-
-
-
-
[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>
-
-
-
-
-
-
-

Test

-
-

Reading from a public MySQL DB

-
-
-
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
-mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-sql_task = NewTask(name='sql',
-                   handler=sql_to_file,
-                   params={'sql_query': mysql_query,
-                           'database_url': mysql_url})
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
-[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed
-
-
-
-
-
-

Run it on a cluster

-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
-[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
-[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87fba74b00>
-
-
-
-
-
-
-
-

SQL query from Iguazio Key Value via Presto

-

You need to create a table and set the sql_table path accordingly.
-you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

-
-
-
# nuclio: ignore
-import os
-sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
-sql_query_string = 'select * from '+sql_table+""
-
-
-
-
-
-
-
%sql select * from $sql_table limit 10
-
-
-
-
-
Done.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
-
-
-
-
sql_task = NewTask(name='sql', 
-                   handler=sql_to_file,
-                   params={'sql_query': sql_query_string,
-                          'database_url': os.getenv('DATABASE_URL')}
-                          )
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
-[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed
-
-
-
-
-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
-[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
-[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87f8e26c18>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/static/function.html b/functions/development/sql_to_file/0.9.1/static/function.html deleted file mode 100644 index 4de597e5..00000000 --- a/functions/development/sql_to_file/0.9.1/static/function.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: sql-to-file
-  tag: ''
-  hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863
-  project: ''
-  labels:
-    author: adih
-  categories:
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: sql_to_file
-  entry_points:
-    sql_to_file:
-      name: sql_to_file
-      doc: SQL Ingest - Ingest data using SQL query
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: sql_query
-        type: str
-        doc: the sql query used to retrieve the data
-        default: ''
-      - name: database_url
-        type: str
-        doc: database connection URL
-        default: ''
-      - name: file_ext
-        type: str
-        doc: ("parquet") format for result file
-        default: parquet
-      outputs:
-      - default: ''
-      lineno: 9
-  description: SQL To File - Ingest data using SQL query
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/static/item.html b/functions/development/sql_to_file/0.9.1/static/item.html deleted file mode 100644 index 88180aac..00000000 --- a/functions/development/sql_to_file/0.9.1/static/item.html +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: SQL To File - Ingest data using SQL query
-doc: ''
-example: sql_to_file.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: adih
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: sql-to-file
-platformVersion: 3.2.0
-spec:
-  filename: sql_to_file.py
-  handler: sql_to_file
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/0.9.1/static/source.html b/functions/development/sql_to_file/0.9.1/static/source.html deleted file mode 100644 index 7d22754f..00000000 --- a/functions/development/sql_to_file/0.9.1/static/source.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-    """
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset(
-        "query result",
-        df=df,
-        format=file_ext,
-        artifact_path=context.artifact_subpath("data"),
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/src/function.yaml b/functions/development/sql_to_file/1.1.0/src/function.yaml deleted file mode 100644 index 10b332a5..00000000 --- a/functions/development/sql_to_file/1.1.0/src/function.yaml +++ /dev/null @@ -1,47 +0,0 @@ -kind: job -metadata: - name: sql-to-file - tag: '' - hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863 - project: '' - labels: - author: adih - categories: - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: sql_to_file - entry_points: - sql_to_file: - name: sql_to_file - doc: SQL Ingest - Ingest data using SQL query - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: sql_query - type: str - doc: the sql query used to retrieve the data - default: '' - - name: database_url - type: str - doc: database connection URL - default: '' - - name: file_ext - type: str - doc: ("parquet") format for result file - default: parquet - outputs: - - default: '' - lineno: 9 - description: SQL To File - Ingest data using SQL query - build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py - affinity: null -verbose: false diff --git a/functions/development/sql_to_file/1.1.0/src/item.yaml b/functions/development/sql_to_file/1.1.0/src/item.yaml deleted file mode 100644 index 2f6ae4c5..00000000 --- a/functions/development/sql_to_file/1.1.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: SQL To File - Ingest data using SQL query -doc: '' -example: sql_to_file.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: adih -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: sql-to-file -platformVersion: 3.5.0 -spec: - filename: sql_to_file.py - handler: sql_to_file - image: mlrun/mlrun - kind: job - requirements: [] 
-url: '' -version: 1.1.0 diff --git a/functions/development/sql_to_file/1.1.0/src/requirements.txt b/functions/development/sql_to_file/1.1.0/src/requirements.txt deleted file mode 100644 index 822eabb8..00000000 --- a/functions/development/sql_to_file/1.1.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyhive -pymysql \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/src/sql_to_file.ipynb b/functions/development/sql_to_file/1.1.0/src/sql_to_file.ipynb deleted file mode 100644 index d4a084ad..00000000 --- a/functions/development/sql_to_file/1.1.0/src/sql_to_file.ipynb +++ /dev/null @@ -1,1567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SQL Ingest - Ingest data using SQL query " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'nuclio'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[1;31m# nuclio: ignore\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 2\u001B[1;33m \u001B[1;32mimport\u001B[0m \u001B[0mnuclio\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 3\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'nuclio'" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config 
kind = \"job\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "pip install sqlalchemy==1.3.11\n", - "pip install PyMySQL==0.9.3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyhive\n", - "from sqlalchemy.engine import create_engine\n", - "from mlrun.execution import MLClientCtx\n", - "\n", - "\n", - "def sql_to_file(\n", - " context: MLClientCtx,\n", - " sql_query: str,\n", - " database_url: str,\n", - " file_ext: str = \"parquet\",\n", - ") -> None:\n", - " \"\"\"SQL Ingest - Ingest data using SQL query\n", - "\n", - " :param context: the function context\n", - " :param sql_query: the sql query used to retrieve the data\n", - " :param database_url: database connection URL\n", - " :param file_ext: (\"parquet\") format for result file\n", - "\n", - "\"\"\"\n", - "\n", - " engine = create_engine(database_url)\n", - " df = pd.read_sql(sql_query, engine)\n", - "\n", - " context.log_dataset('query result',\n", - " df=df,\n", - " format=file_ext,\n", - " artifact_path=context.artifact_subpath('data'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'HOME'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 
2\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34m'http://mlrun-api:8080'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34mf'{os.environ[\"HOME\"]}/artifacts'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;32mC:\\Program Files\\Python37\\lib\\os.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 679\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 680\u001B[0m \u001B[1;31m# raise KeyError with the original key value\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 681\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;32mfrom\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 682\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdecodevalue\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 683\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'HOME'" - ] - } - ], - "source": [ - "from mlrun import 
mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save function" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def mount_secret(\n", - " secret_name, volume_mount_path, volume_name='secret', items=None\n", - "):\n", - " def _mount_secret(task):\n", - " from kubernetes import client as k8s_client\n", - " vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)\n", - " return task.add_volume(\n", - " k8s_client.V1Volume(name=volume_name, secret=vol)\n", - " ).add_volume_mount(\n", - " k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)\n", - " )\n", - " return _mount_secret" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, NewTask\n", - "import os\n", - "\n", - "fn = code_to_function(name=\"sql_to_file\",\n", - " handler=\"sql_to_file\",\n", - " description=\"SQL To File - Ingest data using SQL query\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"adih\"})\n", - "\n", - "if \"V3IO_ACCESS_KEY\" in list(os.environ):\n", - " fn.apply(mount_secret(secret_name='presto-tls',\n", - " volume_mount_path= '/var/run/iguazio/secrets/'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build the image" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest\n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name 
mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. \n", - "\u001B[36mINFO\u001B[0m[0027] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0039] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "\u001B[36mINFO\u001B[0m[0046] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] \n", - "Collecting git+https://github.com/v3io/PyHive.git@v0.6.999\n", - " Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw\n", - " Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw\n", - "Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: PyHive\n", - " Building wheel for PyHive (setup.py): started\n", - " Building wheel for PyHive (setup.py): finished with status 'done'\n", - " Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 
sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d\n", - "Successfully built PyHive\n", - "Installing collected packages: PyHive\n", - "Successfully installed PyHive-0.6.1.dev0\n", - "\u001B[36mINFO\u001B[0m[0048] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0048] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0053] RUN pip install sqlalchemy==1.3.11 \n", - "\u001B[36mINFO\u001B[0m[0053] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0053] args: [-c pip install sqlalchemy==1.3.11] \n", - "Collecting sqlalchemy==1.3.11\n", - " Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)\n", - "Building wheels for collected packages: sqlalchemy\n", - " Building wheel for sqlalchemy (setup.py): started\n", - " Building wheel for sqlalchemy (setup.py): finished with status 'done'\n", - " Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c\n", - " Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16\n", - "Successfully built sqlalchemy\n", - "Installing collected packages: sqlalchemy\n", - " Attempting uninstall: sqlalchemy\n", - " Found existing installation: SQLAlchemy 1.3.17\n", - " Uninstalling SQLAlchemy-1.3.17:\n", - " Successfully uninstalled SQLAlchemy-1.3.17\n", - "Successfully installed sqlalchemy-1.3.11\n", - "\u001B[36mINFO\u001B[0m[0057] Taking snapshot of full filesystem... 
\n", - "\u001B[36mINFO\u001B[0m[0057] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0063] RUN pip install PyMySQL==0.9.3 \n", - "\u001B[36mINFO\u001B[0m[0063] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0063] args: [-c pip install PyMySQL==0.9.3] \n", - "Collecting PyMySQL==0.9.3\n", - " Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)\n", - "Installing collected packages: PyMySQL\n", - "Successfully installed PyMySQL-0.9.3\n", - "\u001B[36mINFO\u001B[0m[0064] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0064] Resolving paths \n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": "" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading from a public MySQL DB" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'\n", - "mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "sql_task = NewTask(name='sql',\n", - " handler=sql_to_file,\n", - " params={'sql_query': mysql_query,\n", - " 'database_url': mysql_url})\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default\n", - "[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run it on a cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz\n", - "[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default\n", - "[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SQL query from Iguazio Key Value via Presto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to create a table and set the sql_table path accordingly.
\n", - "you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import os\n", - "sql_table = os.path.join('v3io.users.\"'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab\"')\n", - "sql_query_string = 'select * from '+sql_table+\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
" - ], - "text/plain": [ - "[('UBS I.ETF-DL G.SEL.DIV.AD', 'ETF', '08:27', 'IE00BMP3HG27', 8.418, '2018-03-26 00:00:00.000', 8.418, 1, 'UBUM', 'EUR', 2505450, 8.418, 403, 8.418),\n", - " ('GILEAD SCIENCES DL-,001', 'Common stock', '08:00', 'US3755581036', 59.7, '2018-03-26 00:00:00.000', 59.84, 3, 'GIS', 'EUR', 2506495, 59.84, 745, 59.7),\n", - " ('3M CO. DL-,01', 'Common stock', '08:00', 'US88579Y1010', 176.51, '2018-03-26 00:00:00.000', 176.51, 1, 'MMM', 'EUR', 2506577, 176.51, 39, 176.51),\n", - " ('DIEBOLD NIXDORF INH.O.N.', 'Common stock', '08:06', 'DE000A0CAYB2', 66.3, '2018-03-26 00:00:00.000', 66.3, 1, 'WIN', 'EUR', 2504286, 66.3, 60, 66.3),\n", - " ('XTR.II EUR.INF.LINK.BD 1C', 'ETF', '08:13', 'LU0290358224', 218.97, '2018-03-26 00:00:00.000', 218.97, 1, 'DBXK', 'EUR', 2505840, 218.97, 110, 218.97),\n", - " ('UBS-ETF-MSCI EMU S.C.EOAD', 'ETF', '08:33', 'LU0671493277', 100.2, '2018-03-26 00:00:00.000', 100.2, 1, 'UEFD', 'EUR', 2506045, 100.2, 180, 100.2),\n", - " ('ASMALLWORLD AG SF 1', 'Common stock', '08:23', 'CH0404880129', 12.7, '2018-03-26 00:00:00.000', 12.7, 1, '1Q7', 'EUR', 3089122, 12.7, 400, 12.7),\n", - " ('IS.DJ GLOB.TITAN.50 U.ETF', 'ETF', '08:42', 'DE0006289382', 31.25, '2018-03-26 00:00:00.000', 31.25, 1, 'EXI2', 'EUR', 2505029, 31.25, 50, 31.25),\n", - " ('ISHS IV-AGEING POPUL.ETF', 'ETF', '08:17', 'IE00BYZK4669', 4.926, '2018-03-26 00:00:00.000', 4.926, 1, '2B77', 'EUR', 2505552, 4.926, 25, 4.926),\n", - " ('PORSCHE AUTOM.HLDG VZO', 'Common stock', '08:00', 'DE000PAH0038', 64.68, '2018-03-26 00:00:00.000', 64.76, 8, 'PAH3', 'EUR', 2504816, 64.76, 698, 64.7)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select * from $sql_table limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sql_task = NewTask(name='sql', \n", - " handler=sql_to_file,\n", - " params={'sql_query': sql_query_string,\n", - " 
'database_url': os.getenv('DATABASE_URL')}\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default\n", - "[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f\n", - "[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default\n", - "[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/src/sql_to_file.py b/functions/development/sql_to_file/1.1.0/src/sql_to_file.py deleted file mode 100644 index 6d5e152b..00000000 --- a/functions/development/sql_to_file/1.1.0/src/sql_to_file.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pyhive -from sqlalchemy.engine import create_engine -from mlrun.execution import MLClientCtx - - -def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - ) diff --git a/functions/development/sql_to_file/1.1.0/src/test_sql_to_file.py b/functions/development/sql_to_file/1.1.0/src/test_sql_to_file.py deleted file mode 100644 index d636b86c..00000000 --- a/functions/development/sql_to_file/1.1.0/src/test_sql_to_file.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from mlrun import code_to_function - -mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam' -mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family' - - -def test_run_sql_to_file(): - fn = code_to_function(name='test_sql_to_file', - filename="sql_to_file.py", - handler="sql_to_file", - kind="job", - ) - run = fn.run(params={'sql_query': mysql_query, - 'database_url': mysql_url}, - local=True) - - assert(run.artifact("query result")) \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/static/documentation.html b/functions/development/sql_to_file/1.1.0/static/documentation.html deleted file mode 100644 index e157d47b..00000000 --- a/functions/development/sql_to_file/1.1.0/static/documentation.html +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - -sql_to_file package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

sql_to_file package

- -
- -
-
-
-
-
-

sql_to_file package#

-
-

Submodules#

-
-
-

sql_to_file.sql_to_file module#

-
-
-sql_to_file.sql_to_file.sql_to_file(context: mlrun.execution.MLClientCtx, sql_query: str, database_url: str, file_ext: str = 'parquet')None[source]#
-

SQL Ingest - Ingest data using SQL query

-
-
Parameters
-
    -
  • context – the function context

  • -
  • sql_query – the sql query used to retrieve the data

  • -
  • database_url – database connection URL

  • -
  • file_ext – (“parquet”) format for result file

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/static/example.html b/functions/development/sql_to_file/1.1.0/static/example.html deleted file mode 100644 index 35853f3f..00000000 --- a/functions/development/sql_to_file/1.1.0/static/example.html +++ /dev/null @@ -1,1603 +0,0 @@ - - - - - - - -SQL Ingest - Ingest data using SQL query - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

SQL Ingest - Ingest data using SQL query

- -
- -
-
-
-
-
-

SQL Ingest - Ingest data using SQL query#

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
---------------------------------------------------------------------------
-ModuleNotFoundError                       Traceback (most recent call last)
-<ipython-input-1-0a63a0760af8> in <module>
-      1 # nuclio: ignore
-----> 2 import nuclio
-      3 
-
-ModuleNotFoundError: No module named 'nuclio'
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-pip install sqlalchemy==1.3.11
-pip install PyMySQL==0.9.3
-
-
-
-
-
-
-
import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-
-"""
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset('query result',
-                        df=df,
-                        format=file_ext,
-                        artifact_path=context.artifact_subpath('data'))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig#

-
-
-
from mlrun import mlconf
-import os
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
---------------------------------------------------------------------------
-KeyError                                  Traceback (most recent call last)
-<ipython-input-7-c3828dc245f1> in <module>
-      2 import os
-      3 mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-----> 4 mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-      5 
-      6 
-
-C:\Program Files\Python37\lib\os.py in __getitem__(self, key)
-    679         except KeyError:
-    680             # raise KeyError with the original key value
---> 681             raise KeyError(key) from None
-    682         return self.decodevalue(value)
-    683 
-
-KeyError: 'HOME'
-
-
-
-
-
-
-

Save function#

-
-
-
def mount_secret(
-    secret_name, volume_mount_path, volume_name='secret', items=None
-):
-    def _mount_secret(task):
-        from kubernetes import client as k8s_client
-        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
-        return task.add_volume(
-            k8s_client.V1Volume(name=volume_name, secret=vol)
-        ).add_volume_mount(
-            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
-        )
-    return _mount_secret
-
-
-
-
-
-
-
from mlrun import code_to_function, NewTask
-import os
-
-fn = code_to_function(name="sql_to_file",
-                      handler="sql_to_file",
-                      description="SQL To File - Ingest data using SQL query",
-                      categories=["data-prep"],
-                      labels={"author": "adih"})
-
-if "V3IO_ACCESS_KEY" in list(os.environ):
-    fn.apply(mount_secret(secret_name='presto-tls',
-                        volume_mount_path= '/var/run/iguazio/secrets/'))
-
-
-
-
-
-
-

Build the image#

-
-
-
fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
-INFO[0027] Taking snapshot of full filesystem...        
-INFO[0039] Resolving paths                              
-INFO[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-INFO[0046] cmd: /bin/sh                                 
-INFO[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] 
-Collecting git+https://github.com/v3io/PyHive.git@v0.6.999
-  Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw
-  Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw
-Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)
-Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)
-Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)
-Building wheels for collected packages: PyHive
-  Building wheel for PyHive (setup.py): started
-  Building wheel for PyHive (setup.py): finished with status 'done'
-  Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166
-  Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d
-Successfully built PyHive
-Installing collected packages: PyHive
-Successfully installed PyHive-0.6.1.dev0
-INFO[0048] Taking snapshot of full filesystem...        
-INFO[0048] Resolving paths                              
-INFO[0053] RUN pip install sqlalchemy==1.3.11           
-INFO[0053] cmd: /bin/sh                                 
-INFO[0053] args: [-c pip install sqlalchemy==1.3.11]    
-Collecting sqlalchemy==1.3.11
-  Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)
-Building wheels for collected packages: sqlalchemy
-  Building wheel for sqlalchemy (setup.py): started
-  Building wheel for sqlalchemy (setup.py): finished with status 'done'
-  Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c
-  Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16
-Successfully built sqlalchemy
-Installing collected packages: sqlalchemy
-  Attempting uninstall: sqlalchemy
-    Found existing installation: SQLAlchemy 1.3.17
-    Uninstalling SQLAlchemy-1.3.17:
-      Successfully uninstalled SQLAlchemy-1.3.17
-Successfully installed sqlalchemy-1.3.11
-INFO[0057] Taking snapshot of full filesystem...        
-INFO[0057] Resolving paths                              
-INFO[0063] RUN pip install PyMySQL==0.9.3               
-INFO[0063] cmd: /bin/sh                                 
-INFO[0063] args: [-c pip install PyMySQL==0.9.3]        
-Collecting PyMySQL==0.9.3
-  Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)
-Installing collected packages: PyMySQL
-Successfully installed PyMySQL-0.9.3
-INFO[0064] Taking snapshot of full filesystem...        
-INFO[0064] Resolving paths                              
-
-
-
True
-
-
-
-
-
-
-
fn.export('function.yaml')
-
-
-
-
-
[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>
-
-
-
-
-
-
-

Test#

-
-

Reading from a public MySQL DB#

-
-
-
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
-mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-sql_task = NewTask(name='sql',
-                   handler=sql_to_file,
-                   params={'sql_query': mysql_query,
-                           'database_url': mysql_url})
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
-[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed
-
-
-
-
-
-

Run it on a cluster#

-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
-[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
-[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87fba74b00>
-
-
-
-
-
-
-
-

SQL query from Iguazio Key Value via Presto#

-

You need to create a table and set the sql_table path accordingly.
-you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

-
-
-
# nuclio: ignore
-import os
-sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
-sql_query_string = 'select * from '+sql_table+""
-
-
-
-
-
-
-
%sql select * from $sql_table limit 10
-
-
-
-
-
Done.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
-
-
-
-
sql_task = NewTask(name='sql', 
-                   handler=sql_to_file,
-                   params={'sql_query': sql_query_string,
-                          'database_url': os.getenv('DATABASE_URL')}
-                          )
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
-[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed
-
-
-
-
-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
-[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
-[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87f8e26c18>
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/static/function.html b/functions/development/sql_to_file/1.1.0/static/function.html deleted file mode 100644 index 4de597e5..00000000 --- a/functions/development/sql_to_file/1.1.0/static/function.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: sql-to-file
-  tag: ''
-  hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863
-  project: ''
-  labels:
-    author: adih
-  categories:
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: sql_to_file
-  entry_points:
-    sql_to_file:
-      name: sql_to_file
-      doc: SQL Ingest - Ingest data using SQL query
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: sql_query
-        type: str
-        doc: the sql query used to retrieve the data
-        default: ''
-      - name: database_url
-        type: str
-        doc: database connection URL
-        default: ''
-      - name: file_ext
-        type: str
-        doc: ("parquet") format for result file
-        default: parquet
-      outputs:
-      - default: ''
-      lineno: 9
-  description: SQL To File - Ingest data using SQL query
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/static/item.html b/functions/development/sql_to_file/1.1.0/static/item.html deleted file mode 100644 index a7e850ee..00000000 --- a/functions/development/sql_to_file/1.1.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: SQL To File - Ingest data using SQL query
-doc: ''
-example: sql_to_file.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: adih
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: sql-to-file
-platformVersion: 3.5.0
-spec:
-  filename: sql_to_file.py
-  handler: sql_to_file
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/static/source.html b/functions/development/sql_to_file/1.1.0/static/source.html deleted file mode 100644 index 990d3b78..00000000 --- a/functions/development/sql_to_file/1.1.0/static/source.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-    """
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset(
-        "query result",
-        df=df,
-        format=file_ext,
-        artifact_path=context.artifact_subpath("data"),
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/1.1.0/static/sql_to_file.html b/functions/development/sql_to_file/1.1.0/static/sql_to_file.html deleted file mode 100644 index 8bfde314..00000000 --- a/functions/development/sql_to_file/1.1.0/static/sql_to_file.html +++ /dev/null @@ -1,185 +0,0 @@ - - - - - - - -sql_to_file.sql_to_file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for sql_to_file.sql_to_file

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-
[docs]def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - )
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/src/function.yaml b/functions/development/sql_to_file/latest/src/function.yaml deleted file mode 100644 index 10b332a5..00000000 --- a/functions/development/sql_to_file/latest/src/function.yaml +++ /dev/null @@ -1,47 +0,0 @@ -kind: job -metadata: - name: sql-to-file - tag: '' - hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863 - project: '' - labels: - author: adih - categories: - - data-preparation -spec: - command: '' - args: [] - image: mlrun/mlrun - env: [] - default_handler: sql_to_file - entry_points: - sql_to_file: - name: sql_to_file - doc: SQL Ingest - Ingest data using SQL query - parameters: - - name: context - type: MLClientCtx - doc: the function context - default: '' - - name: sql_query - type: str - doc: the sql query used to retrieve the data - default: '' - - name: database_url - type: str - doc: database connection URL - default: '' - - name: file_ext - type: str - doc: ("parquet") format for result file - default: parquet - outputs: - - default: '' - lineno: 9 - description: SQL To File - Ingest data using SQL query - build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo= - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py - affinity: null -verbose: false diff --git a/functions/development/sql_to_file/latest/src/item.yaml b/functions/development/sql_to_file/latest/src/item.yaml deleted file mode 100644 index 2f6ae4c5..00000000 --- a/functions/development/sql_to_file/latest/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- data-preparation -description: SQL To File - Ingest data using SQL query -doc: '' -example: sql_to_file.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: adih -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: sql-to-file -platformVersion: 3.5.0 -spec: - filename: sql_to_file.py - handler: sql_to_file - image: mlrun/mlrun - kind: job - requirements: 
[] -url: '' -version: 1.1.0 diff --git a/functions/development/sql_to_file/latest/src/requirements.txt b/functions/development/sql_to_file/latest/src/requirements.txt deleted file mode 100644 index 822eabb8..00000000 --- a/functions/development/sql_to_file/latest/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyhive -pymysql \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/src/sql_to_file.ipynb b/functions/development/sql_to_file/latest/src/sql_to_file.ipynb deleted file mode 100644 index d4a084ad..00000000 --- a/functions/development/sql_to_file/latest/src/sql_to_file.ipynb +++ /dev/null @@ -1,1567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SQL Ingest - Ingest data using SQL query " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'nuclio'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[1;31m# nuclio: ignore\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 2\u001B[1;33m \u001B[1;32mimport\u001B[0m \u001B[0mnuclio\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 3\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mModuleNotFoundError\u001B[0m: No module named 'nuclio'" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - 
"%nuclio config kind = \"job\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "pip install sqlalchemy==1.3.11\n", - "pip install PyMySQL==0.9.3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import pyhive\n", - "from sqlalchemy.engine import create_engine\n", - "from mlrun.execution import MLClientCtx\n", - "\n", - "\n", - "def sql_to_file(\n", - " context: MLClientCtx,\n", - " sql_query: str,\n", - " database_url: str,\n", - " file_ext: str = \"parquet\",\n", - ") -> None:\n", - " \"\"\"SQL Ingest - Ingest data using SQL query\n", - "\n", - " :param context: the function context\n", - " :param sql_query: the sql query used to retrieve the data\n", - " :param database_url: database connection URL\n", - " :param file_ext: (\"parquet\") format for result file\n", - "\n", - "\"\"\"\n", - "\n", - " engine = create_engine(database_url)\n", - " df = pd.read_sql(sql_query, engine)\n", - "\n", - " context.log_dataset('query result',\n", - " df=df,\n", - " format=file_ext,\n", - " artifact_path=context.artifact_subpath('data'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### mlconfig" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'HOME'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "\u001B[1;32m\u001B[0m in 
\u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 2\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mos\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdbpath\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34m'http://mlrun-api:8080'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mmlconf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0martifact_path\u001B[0m \u001B[1;32mor\u001B[0m \u001B[1;34mf'{os.environ[\"HOME\"]}/artifacts'\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;32mC:\\Program Files\\Python37\\lib\\os.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 679\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 680\u001B[0m \u001B[1;31m# raise KeyError with the original key value\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 681\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;32mfrom\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 682\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdecodevalue\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 683\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'HOME'" - ] 
- } - ], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or f'{os.environ[\"HOME\"]}/artifacts'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save function" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def mount_secret(\n", - " secret_name, volume_mount_path, volume_name='secret', items=None\n", - "):\n", - " def _mount_secret(task):\n", - " from kubernetes import client as k8s_client\n", - " vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)\n", - " return task.add_volume(\n", - " k8s_client.V1Volume(name=volume_name, secret=vol)\n", - " ).add_volume_mount(\n", - " k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)\n", - " )\n", - " return _mount_secret" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, NewTask\n", - "import os\n", - "\n", - "fn = code_to_function(name=\"sql_to_file\",\n", - " handler=\"sql_to_file\",\n", - " description=\"SQL To File - Ingest data using SQL query\",\n", - " categories=[\"data-prep\"],\n", - " labels={\"author\": \"adih\"})\n", - "\n", - "if \"V3IO_ACCESS_KEY\" in list(os.environ):\n", - " fn.apply(mount_secret(secret_name='presto-tls',\n", - " volume_mount_path= '/var/run/iguazio/secrets/'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build the image" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest\n", - "\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - 
"\u001B[36mINFO\u001B[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 \n", - "\u001B[36mINFO\u001B[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. \n", - "\u001B[36mINFO\u001B[0m[0027] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0039] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 \n", - "\u001B[36mINFO\u001B[0m[0046] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] \n", - "Collecting git+https://github.com/v3io/PyHive.git@v0.6.999\n", - " Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw\n", - " Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw\n", - "Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: PyHive\n", - " Building wheel for PyHive (setup.py): started\n", - " Building wheel for PyHive (setup.py): finished with status 'done'\n", - " Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl 
size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d\n", - "Successfully built PyHive\n", - "Installing collected packages: PyHive\n", - "Successfully installed PyHive-0.6.1.dev0\n", - "\u001B[36mINFO\u001B[0m[0048] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0048] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0053] RUN pip install sqlalchemy==1.3.11 \n", - "\u001B[36mINFO\u001B[0m[0053] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0053] args: [-c pip install sqlalchemy==1.3.11] \n", - "Collecting sqlalchemy==1.3.11\n", - " Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)\n", - "Building wheels for collected packages: sqlalchemy\n", - " Building wheel for sqlalchemy (setup.py): started\n", - " Building wheel for sqlalchemy (setup.py): finished with status 'done'\n", - " Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c\n", - " Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16\n", - "Successfully built sqlalchemy\n", - "Installing collected packages: sqlalchemy\n", - " Attempting uninstall: sqlalchemy\n", - " Found existing installation: SQLAlchemy 1.3.17\n", - " Uninstalling SQLAlchemy-1.3.17:\n", - " Successfully uninstalled SQLAlchemy-1.3.17\n", - "Successfully installed sqlalchemy-1.3.11\n", - "\u001B[36mINFO\u001B[0m[0057] Taking snapshot of full filesystem... 
\n", - "\u001B[36mINFO\u001B[0m[0057] Resolving paths \n", - "\u001B[36mINFO\u001B[0m[0063] RUN pip install PyMySQL==0.9.3 \n", - "\u001B[36mINFO\u001B[0m[0063] cmd: /bin/sh \n", - "\u001B[36mINFO\u001B[0m[0063] args: [-c pip install PyMySQL==0.9.3] \n", - "Collecting PyMySQL==0.9.3\n", - " Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)\n", - "Installing collected packages: PyMySQL\n", - "Successfully installed PyMySQL-0.9.3\n", - "\u001B[36mINFO\u001B[0m[0064] Taking snapshot of full filesystem... \n", - "\u001B[36mINFO\u001B[0m[0064] Resolving paths \n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": "" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.export('function.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading from a public MySQL DB" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'\n", - "mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask, run_local\n", - "\n", - "sql_task = NewTask(name='sql',\n", - " handler=sql_to_file,\n", - " params={'sql_query': mysql_query,\n", - " 'database_url': mysql_url})\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default\n", - "[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run it on a cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz\n", - "[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default\n", - "[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SQL query from Iguazio Key Value via Presto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to create a table and set the sql_table path accordingly.
\n", - "you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import os\n", - "sql_table = os.path.join('v3io.users.\"'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab\"')\n", - "sql_query_string = 'select * from '+sql_table+\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
" - ], - "text/plain": [ - "[('UBS I.ETF-DL G.SEL.DIV.AD', 'ETF', '08:27', 'IE00BMP3HG27', 8.418, '2018-03-26 00:00:00.000', 8.418, 1, 'UBUM', 'EUR', 2505450, 8.418, 403, 8.418),\n", - " ('GILEAD SCIENCES DL-,001', 'Common stock', '08:00', 'US3755581036', 59.7, '2018-03-26 00:00:00.000', 59.84, 3, 'GIS', 'EUR', 2506495, 59.84, 745, 59.7),\n", - " ('3M CO. DL-,01', 'Common stock', '08:00', 'US88579Y1010', 176.51, '2018-03-26 00:00:00.000', 176.51, 1, 'MMM', 'EUR', 2506577, 176.51, 39, 176.51),\n", - " ('DIEBOLD NIXDORF INH.O.N.', 'Common stock', '08:06', 'DE000A0CAYB2', 66.3, '2018-03-26 00:00:00.000', 66.3, 1, 'WIN', 'EUR', 2504286, 66.3, 60, 66.3),\n", - " ('XTR.II EUR.INF.LINK.BD 1C', 'ETF', '08:13', 'LU0290358224', 218.97, '2018-03-26 00:00:00.000', 218.97, 1, 'DBXK', 'EUR', 2505840, 218.97, 110, 218.97),\n", - " ('UBS-ETF-MSCI EMU S.C.EOAD', 'ETF', '08:33', 'LU0671493277', 100.2, '2018-03-26 00:00:00.000', 100.2, 1, 'UEFD', 'EUR', 2506045, 100.2, 180, 100.2),\n", - " ('ASMALLWORLD AG SF 1', 'Common stock', '08:23', 'CH0404880129', 12.7, '2018-03-26 00:00:00.000', 12.7, 1, '1Q7', 'EUR', 3089122, 12.7, 400, 12.7),\n", - " ('IS.DJ GLOB.TITAN.50 U.ETF', 'ETF', '08:42', 'DE0006289382', 31.25, '2018-03-26 00:00:00.000', 31.25, 1, 'EXI2', 'EUR', 2505029, 31.25, 50, 31.25),\n", - " ('ISHS IV-AGEING POPUL.ETF', 'ETF', '08:17', 'IE00BYZK4669', 4.926, '2018-03-26 00:00:00.000', 4.926, 1, '2B77', 'EUR', 2505552, 4.926, 25, 4.926),\n", - " ('PORSCHE AUTOM.HLDG VZO', 'Common stock', '08:00', 'DE000PAH0038', 64.68, '2018-03-26 00:00:00.000', 64.76, 8, 'PAH3', 'EUR', 2504816, 64.76, 698, 64.7)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select * from $sql_table limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sql_task = NewTask(name='sql', \n", - " handler=sql_to_file,\n", - " params={'sql_query': sql_query_string,\n", - " 
'database_url': os.getenv('DATABASE_URL')}\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf -> http://mlrun-api:8080\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default\n", - "[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed\n" - ] - } - ], - "source": [ - "sql_func = run_local(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f\n", - "[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users.\"admin/examples/stocks_tab\"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default\n", - "[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.run(sql_task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/src/sql_to_file.py b/functions/development/sql_to_file/latest/src/sql_to_file.py deleted file mode 100644 index 6d5e152b..00000000 --- a/functions/development/sql_to_file/latest/src/sql_to_file.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import pandas as pd -import pyhive -from sqlalchemy.engine import create_engine -from mlrun.execution import MLClientCtx - - -def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - ) diff --git a/functions/development/sql_to_file/latest/src/test_sql_to_file.py b/functions/development/sql_to_file/latest/src/test_sql_to_file.py deleted file mode 100644 index d636b86c..00000000 --- a/functions/development/sql_to_file/latest/src/test_sql_to_file.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from mlrun import code_to_function - -mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam' -mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family' - - -def test_run_sql_to_file(): - fn = code_to_function(name='test_sql_to_file', - filename="sql_to_file.py", - handler="sql_to_file", - kind="job", - ) - run = fn.run(params={'sql_query': mysql_query, - 'database_url': mysql_url}, - local=True) - - assert(run.artifact("query result")) \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/static/documentation.html b/functions/development/sql_to_file/latest/static/documentation.html deleted file mode 100644 index e157d47b..00000000 --- a/functions/development/sql_to_file/latest/static/documentation.html +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - -sql_to_file package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

sql_to_file package

- -
- -
-
-
-
-
-

sql_to_file package#

-
-

Submodules#

-
-
-

sql_to_file.sql_to_file module#

-
-
-sql_to_file.sql_to_file.sql_to_file(context: mlrun.execution.MLClientCtx, sql_query: str, database_url: str, file_ext: str = 'parquet')None[source]#
-

SQL Ingest - Ingest data using SQL query

-
-
Parameters
-
    -
  • context – the function context

  • -
  • sql_query – the sql query used to retrieve the data

  • -
  • database_url – database connection URL

  • -
  • file_ext – (“parquet”) format for result file

  • -
-
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/static/example.html b/functions/development/sql_to_file/latest/static/example.html deleted file mode 100644 index 35853f3f..00000000 --- a/functions/development/sql_to_file/latest/static/example.html +++ /dev/null @@ -1,1603 +0,0 @@ - - - - - - - -SQL Ingest - Ingest data using SQL query - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

SQL Ingest - Ingest data using SQL query

- -
- -
-
-
-
-
-

SQL Ingest - Ingest data using SQL query#

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
---------------------------------------------------------------------------
-ModuleNotFoundError                       Traceback (most recent call last)
-<ipython-input-1-0a63a0760af8> in <module>
-      1 # nuclio: ignore
-----> 2 import nuclio
-      3 
-
-ModuleNotFoundError: No module named 'nuclio'
-
-
-
-
-
-
-
%nuclio config kind = "job"
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'job'
-%nuclio: setting spec.image to 'mlrun/mlrun'
-
-
-
-
-
-
-
%%nuclio cmd -c
-pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-pip install sqlalchemy==1.3.11
-pip install PyMySQL==0.9.3
-
-
-
-
-
-
-
import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-
-"""
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset('query result',
-                        df=df,
-                        format=file_ext,
-                        artifact_path=context.artifact_subpath('data'))
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

mlconfig#

-
-
-
from mlrun import mlconf
-import os
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-
-
-
-
-
---------------------------------------------------------------------------
-KeyError                                  Traceback (most recent call last)
-<ipython-input-7-c3828dc245f1> in <module>
-      2 import os
-      3 mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-----> 4 mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'
-      5 
-      6 
-
-C:\Program Files\Python37\lib\os.py in __getitem__(self, key)
-    679         except KeyError:
-    680             # raise KeyError with the original key value
---> 681             raise KeyError(key) from None
-    682         return self.decodevalue(value)
-    683 
-
-KeyError: 'HOME'
-
-
-
-
-
-
-

Save function#

-
-
-
def mount_secret(
-    secret_name, volume_mount_path, volume_name='secret', items=None
-):
-    def _mount_secret(task):
-        from kubernetes import client as k8s_client
-        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
-        return task.add_volume(
-            k8s_client.V1Volume(name=volume_name, secret=vol)
-        ).add_volume_mount(
-            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
-        )
-    return _mount_secret
-
-
-
-
-
-
-
from mlrun import code_to_function, NewTask
-import os
-
-fn = code_to_function(name="sql_to_file",
-                      handler="sql_to_file",
-                      description="SQL To File - Ingest data using SQL query",
-                      categories=["data-prep"],
-                      labels={"author": "adih"})
-
-if "V3IO_ACCESS_KEY" in list(os.environ):
-    fn.apply(mount_secret(secret_name='presto-tls',
-                        volume_mount_path= '/var/run/iguazio/secrets/'))
-
-
-
-
-
-
-

Build the image#

-
-
-
fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Built cross stage deps: map[]                
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
-INFO[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
-INFO[0027] Taking snapshot of full filesystem...        
-INFO[0039] Resolving paths                              
-INFO[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
-INFO[0046] cmd: /bin/sh                                 
-INFO[0046] args: [-c pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999] 
-Collecting git+https://github.com/v3io/PyHive.git@v0.6.999
-  Cloning https://github.com/v3io/PyHive.git (to revision v0.6.999) to /tmp/pip-req-build-ycqhuolw
-  Running command git clone -q https://github.com/v3io/PyHive.git /tmp/pip-req-build-ycqhuolw
-Requirement already satisfied: future in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (0.18.2)
-Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from PyHive==0.6.1.dev0) (2.8.1)
-Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil->PyHive==0.6.1.dev0) (1.15.0)
-Building wheels for collected packages: PyHive
-  Building wheel for PyHive (setup.py): started
-  Building wheel for PyHive (setup.py): finished with status 'done'
-  Created wheel for PyHive: filename=PyHive-0.6.1.dev0-py3-none-any.whl size=46402 sha256=63dca405cbae83da4cfcabfd61fd00f1683bc008c8bfa2272eac7054ec283166
-  Stored in directory: /tmp/pip-ephem-wheel-cache-mwb52l_u/wheels/05/11/cd/4ac4df0fcee76e5ceb614c39c56fca1eead41c0ac32ff6285d
-Successfully built PyHive
-Installing collected packages: PyHive
-Successfully installed PyHive-0.6.1.dev0
-INFO[0048] Taking snapshot of full filesystem...        
-INFO[0048] Resolving paths                              
-INFO[0053] RUN pip install sqlalchemy==1.3.11           
-INFO[0053] cmd: /bin/sh                                 
-INFO[0053] args: [-c pip install sqlalchemy==1.3.11]    
-Collecting sqlalchemy==1.3.11
-  Downloading SQLAlchemy-1.3.11.tar.gz (6.0 MB)
-Building wheels for collected packages: sqlalchemy
-  Building wheel for sqlalchemy (setup.py): started
-  Building wheel for sqlalchemy (setup.py): finished with status 'done'
-  Created wheel for sqlalchemy: filename=SQLAlchemy-1.3.11-cp37-cp37m-linux_x86_64.whl size=1216921 sha256=9dd22e89acfbb68df0c1d189d36907a16c9393e4174598eb4bf377ce57132f3c
-  Stored in directory: /root/.cache/pip/wheels/0a/60/60/f26cbd183a3bb0031ace108156036dd925ec0138ee1c496a16
-Successfully built sqlalchemy
-Installing collected packages: sqlalchemy
-  Attempting uninstall: sqlalchemy
-    Found existing installation: SQLAlchemy 1.3.17
-    Uninstalling SQLAlchemy-1.3.17:
-      Successfully uninstalled SQLAlchemy-1.3.17
-Successfully installed sqlalchemy-1.3.11
-INFO[0057] Taking snapshot of full filesystem...        
-INFO[0057] Resolving paths                              
-INFO[0063] RUN pip install PyMySQL==0.9.3               
-INFO[0063] cmd: /bin/sh                                 
-INFO[0063] args: [-c pip install PyMySQL==0.9.3]        
-Collecting PyMySQL==0.9.3
-  Downloading PyMySQL-0.9.3-py2.py3-none-any.whl (47 kB)
-Installing collected packages: PyMySQL
-Successfully installed PyMySQL-0.9.3
-INFO[0064] Taking snapshot of full filesystem...        
-INFO[0064] Resolving paths                              
-
-
-
True
-
-
-
-
-
-
-
fn.export('function.yaml')
-
-
-
-
-
[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>
-
-
-
-
-
-
-

Test#

-
-

Reading from a public MySQL DB#

-
-
-
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
-mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'
-
-
-
-
-
-
-
from mlrun import NewTask, run_local
-
-sql_task = NewTask(name='sql',
-                   handler=sql_to_file,
-                   params={'sql_query': mysql_query,
-                           'database_url': mysql_url})
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:43:59completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
-[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed
-
-
-
-
-
-

Run it on a cluster#

-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
-[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:06completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-mplpz
sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family
database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
-[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87fba74b00>
-
-
-
-
-
-
-
-

SQL query from Iguazio Key Value via Presto#

-

You need to create a table and set the sql_table path accordingly.
-you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

-
-
-
# nuclio: ignore
-import os
-sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
-sql_query_string = 'select * from '+sql_table+""
-
-
-
-
-
-
-
%sql select * from $sql_table limit 10
-
-
-
-
-
Done.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
securitydescsecuritytypetimeisinminpricedateendpricenumberoftradesmnemoniccurrencysecurityidmaxpricetradedvolumestartprice
UBS I.ETF-DL G.SEL.DIV.ADETF08:27IE00BMP3HG278.4182018-03-26 00:00:00.0008.4181UBUMEUR25054508.4184038.418
GILEAD SCIENCES DL-,001Common stock08:00US375558103659.72018-03-26 00:00:00.00059.843GISEUR250649559.8474559.7
3M CO. DL-,01Common stock08:00US88579Y1010176.512018-03-26 00:00:00.000176.511MMMEUR2506577176.5139176.51
DIEBOLD NIXDORF INH.O.N.Common stock08:06DE000A0CAYB266.32018-03-26 00:00:00.00066.31WINEUR250428666.36066.3
XTR.II EUR.INF.LINK.BD 1CETF08:13LU0290358224218.972018-03-26 00:00:00.000218.971DBXKEUR2505840218.97110218.97
UBS-ETF-MSCI EMU S.C.EOADETF08:33LU0671493277100.22018-03-26 00:00:00.000100.21UEFDEUR2506045100.2180100.2
ASMALLWORLD AG SF 1Common stock08:23CH040488012912.72018-03-26 00:00:00.00012.711Q7EUR308912212.740012.7
IS.DJ GLOB.TITAN.50 U.ETFETF08:42DE000628938231.252018-03-26 00:00:00.00031.251EXI2EUR250502931.255031.25
ISHS IV-AGEING POPUL.ETFETF08:17IE00BYZK46694.9262018-03-26 00:00:00.0004.92612B77EUR25055524.926254.926
PORSCHE AUTOM.HLDG VZOCommon stock08:00DE000PAH003864.682018-03-26 00:00:00.00064.768PAH3EUR250481664.7669864.7
-
-
-
-
sql_task = NewTask(name='sql', 
-                   handler=sql_to_file,
-                   params={'sql_query': sql_query_string,
-                          'database_url': os.getenv('DATABASE_URL')}
-                          )
-
-
-
-
-
-
-
sql_func = run_local(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:14completedsql
v3io_user=admin
kind=handler
owner=admin
host=jupyter-b9c7995f9-4fblj
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
-[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed
-
-
-
-
-
-
-
fn.run(sql_task)
-
-
-
-
-
[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
-[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
-[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 29 12:44:21completedsql
v3io_user=admin
kind=job
owner=admin
host=sql-g7p4f
sql_query=select * from v3io.users."admin/examples/stocks_tab"
database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D
query result
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
-[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f87f8e26c18>
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/static/function.html b/functions/development/sql_to_file/latest/static/function.html deleted file mode 100644 index 4de597e5..00000000 --- a/functions/development/sql_to_file/latest/static/function.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: sql-to-file
-  tag: ''
-  hash: 61f616fe697994e05cf018f2ee94c4ea25ed8863
-  project: ''
-  labels:
-    author: adih
-  categories:
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  env: []
-  default_handler: sql_to_file
-  entry_points:
-    sql_to_file:
-      name: sql_to_file
-      doc: SQL Ingest - Ingest data using SQL query
-      parameters:
-      - name: context
-        type: MLClientCtx
-        doc: the function context
-        default: ''
-      - name: sql_query
-        type: str
-        doc: the sql query used to retrieve the data
-        default: ''
-      - name: database_url
-        type: str
-        doc: database connection URL
-        default: ''
-      - name: file_ext
-        type: str
-        doc: ("parquet") format for result file
-        default: parquet
-      outputs:
-      - default: ''
-      lineno: 9
-  description: SQL To File - Ingest data using SQL query
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHloaXZlCmZyb20gc3FsYWxjaGVteS5lbmdpbmUgaW1wb3J0IGNyZWF0ZV9lbmdpbmUKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIHNxbF90b19maWxlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBzcWxfcXVlcnk6IHN0ciwKICAgIGRhdGFiYXNlX3VybDogc3RyLAogICAgZmlsZV9leHQ6IHN0ciA9ICJwYXJxdWV0IiwKKSAtPiBOb25lOgogICAgIiIiU1FMIEluZ2VzdCAtIEluZ2VzdCBkYXRhIHVzaW5nIFNRTCBxdWVyeQoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBzcWxfcXVlcnk6ICAgICAgICAgdGhlIHNxbCBxdWVyeSB1c2VkIHRvIHJldHJpZXZlIHRoZSBkYXRhCiAgICA6cGFyYW0gZGF0YWJhc2VfdXJsOiAgICAgIGRhdGFiYXNlIGNvbm5lY3Rpb24gVVJMCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgcmVzdWx0IGZpbGUKICAgICIiIgoKICAgIGVuZ2luZSA9IGNyZWF0ZV9lbmdpbmUoZGF0YWJhc2VfdXJsKQogICAgZGYgPSBwZC5yZWFkX3NxbChzcWxfcXVlcnksIGVuZ2luZSkKCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgICJxdWVyeSByZXN1bHQiLAogICAgICAgIGRmPWRmLAogICAgICAgIGZvcm1hdD1maWxlX2V4dCwKICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgiZGF0YSIpLAogICAgKQo=
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/sql_to_file/sql_to_file.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/static/item.html b/functions/development/sql_to_file/latest/static/item.html deleted file mode 100644 index a7e850ee..00000000 --- a/functions/development/sql_to_file/latest/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-preparation
-description: SQL To File - Ingest data using SQL query
-doc: ''
-example: sql_to_file.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: adih
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: sql-to-file
-platformVersion: 3.5.0
-spec:
-  filename: sql_to_file.py
-  handler: sql_to_file
-  image: mlrun/mlrun
-  kind: job
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/static/source.html b/functions/development/sql_to_file/latest/static/source.html deleted file mode 100644 index 990d3b78..00000000 --- a/functions/development/sql_to_file/latest/static/source.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-def sql_to_file(
-    context: MLClientCtx,
-    sql_query: str,
-    database_url: str,
-    file_ext: str = "parquet",
-) -> None:
-    """SQL Ingest - Ingest data using SQL query
-
-    :param context:           the function context
-    :param sql_query:         the sql query used to retrieve the data
-    :param database_url:      database connection URL
-    :param file_ext:          ("parquet") format for result file
-    """
-
-    engine = create_engine(database_url)
-    df = pd.read_sql(sql_query, engine)
-
-    context.log_dataset(
-        "query result",
-        df=df,
-        format=file_ext,
-        artifact_path=context.artifact_subpath("data"),
-    )
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/sql_to_file/latest/static/sql_to_file.html b/functions/development/sql_to_file/latest/static/sql_to_file.html deleted file mode 100644 index 8bfde314..00000000 --- a/functions/development/sql_to_file/latest/static/sql_to_file.html +++ /dev/null @@ -1,185 +0,0 @@ - - - - - - - -sql_to_file.sql_to_file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for sql_to_file.sql_to_file

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import pandas as pd
-import pyhive
-from sqlalchemy.engine import create_engine
-from mlrun.execution import MLClientCtx
-
-
-
[docs]def sql_to_file( - context: MLClientCtx, - sql_query: str, - database_url: str, - file_ext: str = "parquet", -) -> None: - """SQL Ingest - Ingest data using SQL query - - :param context: the function context - :param sql_query: the sql query used to retrieve the data - :param database_url: database connection URL - :param file_ext: ("parquet") format for result file - """ - - engine = create_engine(database_url) - df = pd.read_sql(sql_query, engine) - - context.log_dataset( - "query result", - df=df, - format=file_ext, - artifact_path=context.artifact_subpath("data"), - )
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.0.1/src/function.yaml b/functions/development/stream_to_parquet/0.0.1/src/function.yaml deleted file mode 100644 index 16fd58bc..00000000 --- a/functions/development/stream_to_parquet/0.0.1/src/function.yaml +++ /dev/null @@ -1,43 +0,0 @@ -kind: remote -metadata: - name: stream-to-parquet - tag: '' - hash: a26120ee0994dd4fd0915a700a88b7ec435acc52 - project: default - labels: - author: orz - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Saves a stream to Parquet and can lunch drift detection task on it - min_replicas: 1 - max_replicas: 1 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: stream-to-parquet - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/stream_to_parquet/stream_to_parquet.py - spec: - runtime: python:3.6 - handler: stream_to_parquet:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseSgKICAgICAgICBtbHJ1bi5tb3VudF92M2lvKAogICAgICAgICAgICBtb3VudF9wYXRoPW9zLmdldGVudigibW91bnRfcGF0aCIsICJ+LyIpLAogICAgICAgICAgICByZW1vdGU9b3MuZ2V0ZW52KCJtb3VudF9yZW1vdGUiLCAiL1VzZXIiKSwKICAgICAgICApCiAgICApCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19j
b2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgb3MuZ2V0ZW52KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg== - source: 
'' - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/stream_to_parquet/stream_to_parquet.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/stream_to_parquet/0.0.1/src/item.yaml b/functions/development/stream_to_parquet/0.0.1/src/item.yaml deleted file mode 100644 index 1bebbe10..00000000 --- a/functions/development/stream_to_parquet/0.0.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Saves a stream to Parquet and can lunch drift detection task on it -doc: '' -example: stream_to_parquet.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: stream-to-parquet -platformVersion: '' -spec: - filename: stream_to_parquet.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: [] - customFields: - min_replicas: 1 - max_replicas: 1 -url: '' -version: 0.0.1 diff --git a/functions/development/stream_to_parquet/0.0.1/src/stream_to_parquet.ipynb b/functions/development/stream_to_parquet/0.0.1/src/stream_to_parquet.ipynb deleted file mode 100644 index 1f0554f6..00000000 --- a/functions/development/stream_to_parquet/0.0.1/src/stream_to_parquet.ipynb +++ /dev/null @@ -1,262 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Stream to Parquet" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# Define function spec\n", - "%nuclio config kind = 
\"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "import json\n", - "import datetime\n", - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def record_to_features(record):\n", - " features = record['request']['instances'][0]\n", - " timestamp = record['when']\n", - " prediction = record['resp']\n", - " \n", - " record = {'timestamp': timestamp,\n", - " **features,\n", - " 'predictions': prediction}\n", - " \n", - " return record" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " setattr(context, 'batch', [])\n", - " setattr(context, 'window', int(os.getenv('window', 10))) \n", - " setattr(context, 'save_to', os.getenv('save_to', '/bigdata/inference_pq/'))\n", - " os.makedirs(context.save_to, exist_ok=True)\n", - " \n", - " mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'\n", - " artifact_path = os.getenv('artifact_path', None)\n", - " if artifact_path:\n", - " mlrun.mlconf.artifact_path = artifact_path\n", - " if 'hub_url' in os.environ:\n", - " mlrun.mlconf.hub_url = os.environ['hub_url']\n", - " virtual_drift_fn = mlrun.import_function('hub://virtual_drift')\n", - " virtual_drift_fn.apply(mlrun.mount_v3io(mount_path=os.getenv('mount_path', '~/'), remote=os.getenv('mount_remote', '/User')))\n", - " setattr(context, 'virtual_drift_fn', virtual_drift_fn)\n", - " \n", - " predictions_col = os.getenv('predictions', None) \n", - " label_col = os.getenv('label_col', None)\n", - " setattr(context, 'base_dataset', os.getenv('base_dataset', 
''))\n", - " setattr(context, 'indexes', json.loads(os.environ.get('indexes', '[]')))\n", - " setattr(context, 'predictions_col', predictions_col)\n", - " setattr(context, 'label_col', label_col)\n", - " setattr(context, 'results_tsdb_container', os.getenv('results_tsdb_container', None))\n", - " setattr(context, 'results_tsdb_table', os.getenv('results_tsdb_table', None))" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " \n", - " context.logger.info(f'Adding {event.body}')\n", - " context.batch.append(record_to_features(json.loads(event.body)))\n", - " \n", - " if len(context.batch) > context.window:\n", - " context.logger.info(context.batch[:1])\n", - " context.logger.info(context.indexes)\n", - " df = pd.DataFrame(context.batch)\n", - " context.logger.info(f'df example: {df.head(1)}')\n", - " if context.indexes:\n", - " df = df.set_index(context.indexes)\n", - " df_path = os.path.join(context.save_to, f\"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq\")\n", - " df.to_parquet(df_path)\n", - "\n", - " task = mlrun.NewTask(name='drift_magnitude',\n", - " handler='drift_magnitude',\n", - " params={'label_col': context.label_col,\n", - " 'prediction_col': context.predictions_col,\n", - " 'results_tsdb_container': context.results_tsdb_container,\n", - " 'results_tsdb_table': context.results_tsdb_table},\n", - " inputs={'t': context.base_dataset,\n", - " 'u': df_path},\n", - " artifact_path=mlrun.mlconf.artifact_path)\n", - " \n", - " context.virtual_drift_fn.run(task,\n", - " watch=False)\n", - " \n", - " context.batch = []" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save to function yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-12-24 14:34:39,340 [info] function spec saved to path: function.yaml\n", - "> 2020-12-24 14:34:39,350 [info] Starting remote function deploy\n", - "2020-12-24 14:34:39 (info) Deploying function\n", - "2020-12-24 14:34:39 (info) Building\n", - "2020-12-24 14:34:39 (info) Staging files and preparing base images\n", - "2020-12-24 14:34:39 (info) Building processor image\n", - "2020-12-24 14:34:40 (info) Build complete\n", - "2020-12-24 14:34:50 (info) Function deploy complete\n", - "> 2020-12-24 14:34:51,915 [info] function deployed, address=default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225\n" - ] - }, - { - "data": { - "text/plain": [ - "'http://default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225'" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "import json\n", - "from nuclio.triggers import V3IOStreamTrigger\n", - "from mlrun import mlconf, code_to_function, mount_v3io\n", - "\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"stream_to_parquet\")\n", - "fn.spec.min_replicas = 1\n", - "fn.spec.max_replicas = 1\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Saves a stream to Parquet and can lunch drift detection task on it\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\"}\n", - "fn.export(\"function.yaml\")\n", - "\n", - "fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=f'https://{os.environ[\"V3IO_API\"]}/users/orz/mlrun-demos/demos/network-operations/streaming/labeled_stream@s2p1', seekTo='latest'))\n", - "fn.apply(mount_v3io())\n", - "projdir = '/User/mlrun-demos/demos/network-operations/'\n", - "fn.set_envs({'window': 10000,\n", - " 'indexes': 
json.dumps(['timestamp', 'company', 'data_center', 'device']),\n", - " 'save_to': os.path.join(projdir, 'streaming', 'inference_pq'),\n", - " 'prediction_col': 'prediction',\n", - " 'label_col': 'is_error',\n", - " 'base_dataset': '/User/mlrun-demos/demos/network-operations/artifacts/test_set_preds.parquet',\n", - " 'results_tsdb_container': 'users',\n", - " 'results_tsdb_table': 'orz/mlrun-demos/demos/network-operations/streaming/s2p_tsdb',\n", - " 'mount_path': '/users/orz',\n", - " 'mount_remote': '/User',\n", - " 'artifact_path': '/User/mlrun-demos/demos/network-operations/streaming/drift_magnitude'})\n", - "fn.deploy(project='network-operations')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/stream_to_parquet/0.0.1/src/stream_to_parquet.py b/functions/development/stream_to_parquet/0.0.1/src/stream_to_parquet.py deleted file mode 100644 index e50caee1..00000000 --- a/functions/development/stream_to_parquet/0.0.1/src/stream_to_parquet.py +++ /dev/null @@ -1,87 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import json -import datetime -import mlrun - - -def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record - - -def init_context(context): - setattr(context, "batch", []) - setattr(context, "window", 
int(os.getenv("window", 10))) - setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply( - mlrun.mount_v3io( - mount_path=os.getenv("mount_path", "~/"), - remote=os.getenv("mount_remote", "/User"), - ) - ) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None)) - - -def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - 
"results_tsdb_table": context.results_tsdb_table, - }, - inputs={"t": context.base_dataset, "u": df_path}, - artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = [] diff --git a/functions/development/stream_to_parquet/0.0.1/static/documentation.html b/functions/development/stream_to_parquet/0.0.1/static/documentation.html deleted file mode 100644 index 52e2c852..00000000 --- a/functions/development/stream_to_parquet/0.0.1/static/documentation.html +++ /dev/null @@ -1,137 +0,0 @@ - - - - - - - -stream_to_parquet package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

stream_to_parquet package

-
-

Submodules

-
-
-

stream_to_parquet.stream_to_parquet module

-
-
-stream_to_parquet.stream_to_parquet.handler(context, event)[source]
-
-
-
-stream_to_parquet.stream_to_parquet.init_context(context)[source]
-
-
-
-stream_to_parquet.stream_to_parquet.record_to_features(record)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.0.1/static/example.html b/functions/development/stream_to_parquet/0.0.1/static/example.html deleted file mode 100644 index 21b20ffc..00000000 --- a/functions/development/stream_to_parquet/0.0.1/static/example.html +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - -Stream to Parquet - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Stream to Parquet

-
-
-
import nuclio
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-
-
-
-
-
def record_to_features(record):
-    features = record['request']['instances'][0]
-    timestamp = record['when']
-    prediction = record['resp']
-    
-    record = {'timestamp': timestamp,
-              **features,
-              'predictions': prediction}
-    
-    return record
-
-
-
-
-
-
-
def init_context(context):
-    setattr(context, 'batch', [])
-    setattr(context, 'window', int(os.getenv('window', 10)))    
-    setattr(context, 'save_to', os.getenv('save_to', '/bigdata/inference_pq/'))
-    os.makedirs(context.save_to, exist_ok=True)
-    
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'
-    artifact_path = os.getenv('artifact_path', None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if 'hub_url' in os.environ:
-        mlrun.mlconf.hub_url = os.environ['hub_url']
-    virtual_drift_fn = mlrun.import_function('hub://virtual_drift')
-    virtual_drift_fn.apply(mlrun.mount_v3io(mount_path=os.getenv('mount_path', '~/'), remote=os.getenv('mount_remote', '/User')))
-    setattr(context, 'virtual_drift_fn', virtual_drift_fn)
-    
-    predictions_col = os.getenv('predictions', None) 
-    label_col = os.getenv('label_col', None)
-    setattr(context, 'base_dataset', os.getenv('base_dataset', ''))
-    setattr(context, 'indexes', json.loads(os.environ.get('indexes', '[]')))
-    setattr(context, 'predictions_col', predictions_col)
-    setattr(context, 'label_col', label_col)
-    setattr(context, 'results_tsdb_container', os.getenv('results_tsdb_container', None))
-    setattr(context, 'results_tsdb_table', os.getenv('results_tsdb_table', None))
-
-
-
-
-
-
-
def handler(context, event):
-    
-    context.logger.info(f'Adding {event.body}')
-    context.batch.append(record_to_features(json.loads(event.body)))
-    
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f'df example: {df.head(1)}')
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(context.save_to, f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq")
-        df.to_parquet(df_path)
-
-        task = mlrun.NewTask(name='drift_magnitude',
-                        handler='drift_magnitude',
-                        params={'label_col': context.label_col,
-                                'prediction_col': context.predictions_col,
-                                'results_tsdb_container': context.results_tsdb_container,
-                                'results_tsdb_table': context.results_tsdb_table},
-                        inputs={'t': context.base_dataset,
-                                'u': df_path},
-                        artifact_path=mlrun.mlconf.artifact_path)
-        
-        context.virtual_drift_fn.run(task,
-                                     watch=False)
-        
-        context.batch = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Save to function yaml

-
-
-
import os
-import json
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import mlconf, code_to_function, mount_v3io
-
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-# create job function object from notebook code
-fn = code_to_function("stream_to_parquet")
-fn.spec.min_replicas = 1
-fn.spec.max_replicas = 1
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Saves a stream to Parquet and can lunch drift detection task on it"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz"}
-fn.export("function.yaml")
-
-fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=f'https://{os.environ["V3IO_API"]}/users/orz/mlrun-demos/demos/network-operations/streaming/labeled_stream@s2p1', seekTo='latest'))
-fn.apply(mount_v3io())
-projdir = '/User/mlrun-demos/demos/network-operations/'
-fn.set_envs({'window': 10000,
-             'indexes': json.dumps(['timestamp', 'company', 'data_center', 'device']),
-             'save_to': os.path.join(projdir, 'streaming', 'inference_pq'),
-             'prediction_col': 'prediction',
-             'label_col': 'is_error',
-             'base_dataset': '/User/mlrun-demos/demos/network-operations/artifacts/test_set_preds.parquet',
-             'results_tsdb_container': 'users',
-             'results_tsdb_table': 'orz/mlrun-demos/demos/network-operations/streaming/s2p_tsdb',
-             'mount_path': '/users/orz',
-             'mount_remote': '/User',
-             'artifact_path': '/User/mlrun-demos/demos/network-operations/streaming/drift_magnitude'})
-fn.deploy(project='network-operations')
-
-
-
-
-
> 2020-12-24 14:34:39,340 [info] function spec saved to path: function.yaml
-> 2020-12-24 14:34:39,350 [info] Starting remote function deploy
-2020-12-24 14:34:39  (info) Deploying function
-2020-12-24 14:34:39  (info) Building
-2020-12-24 14:34:39  (info) Staging files and preparing base images
-2020-12-24 14:34:39  (info) Building processor image
-2020-12-24 14:34:40  (info) Build complete
-2020-12-24 14:34:50  (info) Function deploy complete
-> 2020-12-24 14:34:51,915 [info] function deployed, address=default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225
-
-
-
'http://default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225'
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.0.1/static/function.html b/functions/development/stream_to_parquet/0.0.1/static/function.html deleted file mode 100644 index 9052f2cd..00000000 --- a/functions/development/stream_to_parquet/0.0.1/static/function.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: stream-to-parquet
-  tag: ''
-  hash: a26120ee0994dd4fd0915a700a88b7ec435acc52
-  project: default
-  labels:
-    author: orz
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Saves a stream to Parquet and can lunch drift detection task on it
-  min_replicas: 1
-  max_replicas: 1
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: stream-to-parquet
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/stream_to_parquet/stream_to_parquet.py
-    spec:
-      runtime: python:3.6
-      handler: stream_to_parquet:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseSgKICAgICAgICBtbHJ1bi5tb3VudF92M2lvKAogICAgICAgICAgICBtb3VudF9wYXRoPW9zLmdldGVudigibW91bnRfcGF0aCIsICJ+LyIpLAogICAgICAgICAgICByZW1vdGU9b3MuZ2V0ZW52KCJtb3VudF9yZW1vdGUiLCAiL1VzZXIiKSwKICAgICAgICApCiAgICApCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGN
vbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgb3MuZ2V0ZW52KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5
iYXRjaCA9IFtdCg==
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/stream_to_parquet/stream_to_parquet.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.0.1/static/item.html b/functions/development/stream_to_parquet/0.0.1/static/item.html deleted file mode 100644 index 2dab5d1b..00000000 --- a/functions/development/stream_to_parquet/0.0.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Saves a stream to Parquet and can lunch drift detection task on it
-doc: ''
-example: stream_to_parquet.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: stream-to-parquet
-platformVersion: ''
-spec:
-  filename: stream_to_parquet.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements: []
-  customFields:
-    min_replicas: 1
-    max_replicas: 1
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.0.1/static/source.html b/functions/development/stream_to_parquet/0.0.1/static/source.html deleted file mode 100644 index a8468a39..00000000 --- a/functions/development/stream_to_parquet/0.0.1/static/source.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-def record_to_features(record):
-    features = record["request"]["instances"][0]
-    timestamp = record["when"]
-    prediction = record["resp"]
-
-    record = {"timestamp": timestamp, **features, "predictions": prediction}
-
-    return record
-
-
-def init_context(context):
-    setattr(context, "batch", [])
-    setattr(context, "window", int(os.getenv("window", 10)))
-    setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/"))
-    os.makedirs(context.save_to, exist_ok=True)
-
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080"
-    artifact_path = os.getenv("artifact_path", None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if "hub_url" in os.environ:
-        mlrun.mlconf.hub_url = os.environ["hub_url"]
-    virtual_drift_fn = mlrun.import_function("hub://virtual_drift")
-    virtual_drift_fn.apply(
-        mlrun.mount_v3io(
-            mount_path=os.getenv("mount_path", "~/"),
-            remote=os.getenv("mount_remote", "/User"),
-        )
-    )
-    setattr(context, "virtual_drift_fn", virtual_drift_fn)
-
-    predictions_col = os.getenv("predictions", None)
-    label_col = os.getenv("label_col", None)
-    setattr(context, "base_dataset", os.getenv("base_dataset", ""))
-    setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]")))
-    setattr(context, "predictions_col", predictions_col)
-    setattr(context, "label_col", label_col)
-    setattr(
-        context, "results_tsdb_container", os.getenv("results_tsdb_container", None)
-    )
-    setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
-
-
-def handler(context, event):
-
-    context.logger.info(f"Adding {event.body}")
-    context.batch.append(record_to_features(json.loads(event.body)))
-
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f"df example: {df.head(1)}")
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(
-            context.save_to,
-            f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq",
-        )
-        df.to_parquet(df_path)
-
-        task = mlrun.NewTask(
-            name="drift_magnitude",
-            handler="drift_magnitude",
-            params={
-                "label_col": context.label_col,
-                "prediction_col": context.predictions_col,
-                "results_tsdb_container": context.results_tsdb_container,
-                "results_tsdb_table": context.results_tsdb_table,
-            },
-            inputs={"t": context.base_dataset, "u": df_path},
-            artifact_path=mlrun.mlconf.artifact_path,
-        )
-
-        context.virtual_drift_fn.run(task, watch=False)
-
-        context.batch = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.8.0/src/function.yaml b/functions/development/stream_to_parquet/0.8.0/src/function.yaml deleted file mode 100644 index 0e3a790f..00000000 --- a/functions/development/stream_to_parquet/0.8.0/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: remote -metadata: - name: stream-to-parquet - tag: '' - hash: 78316bfbe731714715c19f0bc6deabf8652f15c4 - project: default - labels: - author: orz - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Saves a stream to Parquet and can lunch drift detection task on it - min_replicas: 1 - max_replicas: 1 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: stream-to-parquet - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py - spec: - runtime: python:3.6 - handler: stream_to_parquet:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgb3MuZ2V0ZW52
KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg== - source: '' - build: - commands: [] - code_origin: 
https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py - origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/stream_to_parquet/0.8.0/src/item.yaml b/functions/development/stream_to_parquet/0.8.0/src/item.yaml deleted file mode 100644 index 644beda1..00000000 --- a/functions/development/stream_to_parquet/0.8.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Saves a stream to Parquet and can lunch drift detection task on it -doc: '' -example: stream_to_parquet.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: stream-to-parquet -platformVersion: 3.2.0 -spec: - customFields: - max_replicas: 1 - min_replicas: 1 - filename: stream_to_parquet.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: [] -url: '' -version: 0.8.0 diff --git a/functions/development/stream_to_parquet/0.8.0/src/stream_to_parquet.ipynb b/functions/development/stream_to_parquet/0.8.0/src/stream_to_parquet.ipynb deleted file mode 100644 index 1f0554f6..00000000 --- a/functions/development/stream_to_parquet/0.8.0/src/stream_to_parquet.ipynb +++ /dev/null @@ -1,262 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Stream to Parquet" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - 
"source": [ - "# Define function spec\n", - "%nuclio config kind = \"nuclio\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "import json\n", - "import datetime\n", - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def record_to_features(record):\n", - " features = record['request']['instances'][0]\n", - " timestamp = record['when']\n", - " prediction = record['resp']\n", - " \n", - " record = {'timestamp': timestamp,\n", - " **features,\n", - " 'predictions': prediction}\n", - " \n", - " return record" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "def init_context(context):\n", - " setattr(context, 'batch', [])\n", - " setattr(context, 'window', int(os.getenv('window', 10))) \n", - " setattr(context, 'save_to', os.getenv('save_to', '/bigdata/inference_pq/'))\n", - " os.makedirs(context.save_to, exist_ok=True)\n", - " \n", - " mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'\n", - " artifact_path = os.getenv('artifact_path', None)\n", - " if artifact_path:\n", - " mlrun.mlconf.artifact_path = artifact_path\n", - " if 'hub_url' in os.environ:\n", - " mlrun.mlconf.hub_url = os.environ['hub_url']\n", - " virtual_drift_fn = mlrun.import_function('hub://virtual_drift')\n", - " virtual_drift_fn.apply(mlrun.mount_v3io(mount_path=os.getenv('mount_path', '~/'), remote=os.getenv('mount_remote', '/User')))\n", - " setattr(context, 'virtual_drift_fn', virtual_drift_fn)\n", - " \n", - " predictions_col = os.getenv('predictions', None) \n", - " label_col = os.getenv('label_col', None)\n", 
- " setattr(context, 'base_dataset', os.getenv('base_dataset', ''))\n", - " setattr(context, 'indexes', json.loads(os.environ.get('indexes', '[]')))\n", - " setattr(context, 'predictions_col', predictions_col)\n", - " setattr(context, 'label_col', label_col)\n", - " setattr(context, 'results_tsdb_container', os.getenv('results_tsdb_container', None))\n", - " setattr(context, 'results_tsdb_table', os.getenv('results_tsdb_table', None))" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def handler(context, event):\n", - " \n", - " context.logger.info(f'Adding {event.body}')\n", - " context.batch.append(record_to_features(json.loads(event.body)))\n", - " \n", - " if len(context.batch) > context.window:\n", - " context.logger.info(context.batch[:1])\n", - " context.logger.info(context.indexes)\n", - " df = pd.DataFrame(context.batch)\n", - " context.logger.info(f'df example: {df.head(1)}')\n", - " if context.indexes:\n", - " df = df.set_index(context.indexes)\n", - " df_path = os.path.join(context.save_to, f\"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq\")\n", - " df.to_parquet(df_path)\n", - "\n", - " task = mlrun.NewTask(name='drift_magnitude',\n", - " handler='drift_magnitude',\n", - " params={'label_col': context.label_col,\n", - " 'prediction_col': context.predictions_col,\n", - " 'results_tsdb_container': context.results_tsdb_container,\n", - " 'results_tsdb_table': context.results_tsdb_table},\n", - " inputs={'t': context.base_dataset,\n", - " 'u': df_path},\n", - " artifact_path=mlrun.mlconf.artifact_path)\n", - " \n", - " context.virtual_drift_fn.run(task,\n", - " watch=False)\n", - " \n", - " context.batch = []" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save to function yaml" - ] - }, - { - "cell_type": "code", - 
"execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-12-24 14:34:39,340 [info] function spec saved to path: function.yaml\n", - "> 2020-12-24 14:34:39,350 [info] Starting remote function deploy\n", - "2020-12-24 14:34:39 (info) Deploying function\n", - "2020-12-24 14:34:39 (info) Building\n", - "2020-12-24 14:34:39 (info) Staging files and preparing base images\n", - "2020-12-24 14:34:39 (info) Building processor image\n", - "2020-12-24 14:34:40 (info) Build complete\n", - "2020-12-24 14:34:50 (info) Function deploy complete\n", - "> 2020-12-24 14:34:51,915 [info] function deployed, address=default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225\n" - ] - }, - { - "data": { - "text/plain": [ - "'http://default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225'" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "import json\n", - "from nuclio.triggers import V3IOStreamTrigger\n", - "from mlrun import mlconf, code_to_function, mount_v3io\n", - "\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "\n", - "# create job function object from notebook code\n", - "fn = code_to_function(\"stream_to_parquet\")\n", - "fn.spec.min_replicas = 1\n", - "fn.spec.max_replicas = 1\n", - "\n", - "# add metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"handler\"\n", - "fn.spec.description = \"Saves a stream to Parquet and can lunch drift detection task on it\"\n", - "fn.metadata.categories = [\"ml\", \"serve\"]\n", - "fn.metadata.labels = {\"author\": \"orz\"}\n", - "fn.export(\"function.yaml\")\n", - "\n", - "fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=f'https://{os.environ[\"V3IO_API\"]}/users/orz/mlrun-demos/demos/network-operations/streaming/labeled_stream@s2p1', seekTo='latest'))\n", - "fn.apply(mount_v3io())\n", - "projdir = '/User/mlrun-demos/demos/network-operations/'\n", - 
"fn.set_envs({'window': 10000,\n", - " 'indexes': json.dumps(['timestamp', 'company', 'data_center', 'device']),\n", - " 'save_to': os.path.join(projdir, 'streaming', 'inference_pq'),\n", - " 'prediction_col': 'prediction',\n", - " 'label_col': 'is_error',\n", - " 'base_dataset': '/User/mlrun-demos/demos/network-operations/artifacts/test_set_preds.parquet',\n", - " 'results_tsdb_container': 'users',\n", - " 'results_tsdb_table': 'orz/mlrun-demos/demos/network-operations/streaming/s2p_tsdb',\n", - " 'mount_path': '/users/orz',\n", - " 'mount_remote': '/User',\n", - " 'artifact_path': '/User/mlrun-demos/demos/network-operations/streaming/drift_magnitude'})\n", - "fn.deploy(project='network-operations')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/stream_to_parquet/0.8.0/src/stream_to_parquet.py b/functions/development/stream_to_parquet/0.8.0/src/stream_to_parquet.py deleted file mode 100644 index 8ec5635f..00000000 --- a/functions/development/stream_to_parquet/0.8.0/src/stream_to_parquet.py +++ /dev/null @@ -1,82 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import json -import datetime -import mlrun - - -def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record - - -def init_context(context): - setattr(context, 
"batch", []) - setattr(context, "window", int(os.getenv("window", 10))) - setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply(mlrun.auto_mount()) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None)) - - -def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path,index=False) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - "results_tsdb_table": context.results_tsdb_table, - }, - 
inputs={"t": context.base_dataset, "u": df_path}, - artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = [] diff --git a/functions/development/stream_to_parquet/0.8.0/static/documentation.html b/functions/development/stream_to_parquet/0.8.0/static/documentation.html deleted file mode 100644 index 52e2c852..00000000 --- a/functions/development/stream_to_parquet/0.8.0/static/documentation.html +++ /dev/null @@ -1,137 +0,0 @@ - - - - - - - -stream_to_parquet package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

stream_to_parquet package

-
-

Submodules

-
-
-

stream_to_parquet.stream_to_parquet module

-
-
-stream_to_parquet.stream_to_parquet.handler(context, event)[source]
-
-
-
-stream_to_parquet.stream_to_parquet.init_context(context)[source]
-
-
-
-stream_to_parquet.stream_to_parquet.record_to_features(record)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.8.0/static/example.html b/functions/development/stream_to_parquet/0.8.0/static/example.html deleted file mode 100644 index 9fd3dfc9..00000000 --- a/functions/development/stream_to_parquet/0.8.0/static/example.html +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - - -Stream to Parquet - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Stream to Parquet

-
-
-
import nuclio
-
-
-
-
-
-
-
# Define function spec
-%nuclio config kind = "nuclio"
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio'
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-
-
-
-
-
def record_to_features(record):
-    features = record['request']['instances'][0]
-    timestamp = record['when']
-    prediction = record['resp']
-    
-    record = {'timestamp': timestamp,
-              **features,
-              'predictions': prediction}
-    
-    return record
-
-
-
-
-
-
-
def init_context(context):
-    setattr(context, 'batch', [])
-    setattr(context, 'window', int(os.getenv('window', 10)))    
-    setattr(context, 'save_to', os.getenv('save_to', '/bigdata/inference_pq/'))
-    os.makedirs(context.save_to, exist_ok=True)
-    
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'
-    artifact_path = os.getenv('artifact_path', None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if 'hub_url' in os.environ:
-        mlrun.mlconf.hub_url = os.environ['hub_url']
-    virtual_drift_fn = mlrun.import_function('hub://virtual_drift')
-    virtual_drift_fn.apply(mlrun.mount_v3io(mount_path=os.getenv('mount_path', '~/'), remote=os.getenv('mount_remote', '/User')))
-    setattr(context, 'virtual_drift_fn', virtual_drift_fn)
-    
-    predictions_col = os.getenv('predictions', None) 
-    label_col = os.getenv('label_col', None)
-    setattr(context, 'base_dataset', os.getenv('base_dataset', ''))
-    setattr(context, 'indexes', json.loads(os.environ.get('indexes', '[]')))
-    setattr(context, 'predictions_col', predictions_col)
-    setattr(context, 'label_col', label_col)
-    setattr(context, 'results_tsdb_container', os.getenv('results_tsdb_container', None))
-    setattr(context, 'results_tsdb_table', os.getenv('results_tsdb_table', None))
-
-
-
-
-
-
-
def handler(context, event):
-    
-    context.logger.info(f'Adding {event.body}')
-    context.batch.append(record_to_features(json.loads(event.body)))
-    
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f'df example: {df.head(1)}')
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(context.save_to, f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq")
-        df.to_parquet(df_path)
-
-        task = mlrun.NewTask(name='drift_magnitude',
-                        handler='drift_magnitude',
-                        params={'label_col': context.label_col,
-                                'prediction_col': context.predictions_col,
-                                'results_tsdb_container': context.results_tsdb_container,
-                                'results_tsdb_table': context.results_tsdb_table},
-                        inputs={'t': context.base_dataset,
-                                'u': df_path},
-                        artifact_path=mlrun.mlconf.artifact_path)
-        
-        context.virtual_drift_fn.run(task,
-                                     watch=False)
-        
-        context.batch = []
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-

Save to function yaml

-
-
-
import os
-import json
-from nuclio.triggers import V3IOStreamTrigger
-from mlrun import mlconf, code_to_function, mount_v3io
-
-mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
-
-# create job function object from notebook code
-fn = code_to_function("stream_to_parquet")
-fn.spec.min_replicas = 1
-fn.spec.max_replicas = 1
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "handler"
-fn.spec.description = "Saves a stream to Parquet and can lunch drift detection task on it"
-fn.metadata.categories = ["ml", "serve"]
-fn.metadata.labels = {"author": "orz"}
-fn.export("function.yaml")
-
-fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=f'https://{os.environ["V3IO_API"]}/users/orz/mlrun-demos/demos/network-operations/streaming/labeled_stream@s2p1', seekTo='latest'))
-fn.apply(mount_v3io())
-projdir = '/User/mlrun-demos/demos/network-operations/'
-fn.set_envs({'window': 10000,
-             'indexes': json.dumps(['timestamp', 'company', 'data_center', 'device']),
-             'save_to': os.path.join(projdir, 'streaming', 'inference_pq'),
-             'prediction_col': 'prediction',
-             'label_col': 'is_error',
-             'base_dataset': '/User/mlrun-demos/demos/network-operations/artifacts/test_set_preds.parquet',
-             'results_tsdb_container': 'users',
-             'results_tsdb_table': 'orz/mlrun-demos/demos/network-operations/streaming/s2p_tsdb',
-             'mount_path': '/users/orz',
-             'mount_remote': '/User',
-             'artifact_path': '/User/mlrun-demos/demos/network-operations/streaming/drift_magnitude'})
-fn.deploy(project='network-operations')
-
-
-
-
-
> 2020-12-24 14:34:39,340 [info] function spec saved to path: function.yaml
-> 2020-12-24 14:34:39,350 [info] Starting remote function deploy
-2020-12-24 14:34:39  (info) Deploying function
-2020-12-24 14:34:39  (info) Building
-2020-12-24 14:34:39  (info) Staging files and preparing base images
-2020-12-24 14:34:39  (info) Building processor image
-2020-12-24 14:34:40  (info) Build complete
-2020-12-24 14:34:50  (info) Function deploy complete
-> 2020-12-24 14:34:51,915 [info] function deployed, address=default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225
-
-
-
'http://default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:32225'
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.8.0/static/function.html b/functions/development/stream_to_parquet/0.8.0/static/function.html deleted file mode 100644 index d9f6275c..00000000 --- a/functions/development/stream_to_parquet/0.8.0/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: stream-to-parquet
-  tag: ''
-  hash: 78316bfbe731714715c19f0bc6deabf8652f15c4
-  project: default
-  labels:
-    author: orz
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Saves a stream to Parquet and can lunch drift detection task on it
-  min_replicas: 1
-  max_replicas: 1
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: stream-to-parquet
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py
-    spec:
-      runtime: python:3.6
-      handler: stream_to_parquet:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJ
fY29udGFpbmVyIiwgb3MuZ2V0ZW52KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg==
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py
-    origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.8.0/static/item.html b/functions/development/stream_to_parquet/0.8.0/static/item.html deleted file mode 100644 index 39f55916..00000000 --- a/functions/development/stream_to_parquet/0.8.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Saves a stream to Parquet and can lunch drift detection task on it
-doc: ''
-example: stream_to_parquet.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: stream-to-parquet
-platformVersion: 3.2.0
-spec:
-  customFields:
-    max_replicas: 1
-    min_replicas: 1
-  filename: stream_to_parquet.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements: []
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.8.0/static/source.html b/functions/development/stream_to_parquet/0.8.0/static/source.html deleted file mode 100644 index bf5efa53..00000000 --- a/functions/development/stream_to_parquet/0.8.0/static/source.html +++ /dev/null @@ -1,104 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-def record_to_features(record):
-    features = record["request"]["instances"][0]
-    timestamp = record["when"]
-    prediction = record["resp"]
-
-    record = {"timestamp": timestamp, **features, "predictions": prediction}
-
-    return record
-
-
-def init_context(context):
-    setattr(context, "batch", [])
-    setattr(context, "window", int(os.getenv("window", 10)))
-    setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/"))
-    os.makedirs(context.save_to, exist_ok=True)
-
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080"
-    artifact_path = os.getenv("artifact_path", None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if "hub_url" in os.environ:
-        mlrun.mlconf.hub_url = os.environ["hub_url"]
-    virtual_drift_fn = mlrun.import_function("hub://virtual_drift")
-    virtual_drift_fn.apply(mlrun.auto_mount())
-    setattr(context, "virtual_drift_fn", virtual_drift_fn)
-
-    predictions_col = os.getenv("predictions", None)
-    label_col = os.getenv("label_col", None)
-    setattr(context, "base_dataset", os.getenv("base_dataset", ""))
-    setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]")))
-    setattr(context, "predictions_col", predictions_col)
-    setattr(context, "label_col", label_col)
-    setattr(
-        context, "results_tsdb_container", os.getenv("results_tsdb_container", None)
-    )
-    setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
-
-
-def handler(context, event):
-
-    context.logger.info(f"Adding {event.body}")
-    context.batch.append(record_to_features(json.loads(event.body)))
-
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f"df example: {df.head(1)}")
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(
-            context.save_to,
-            f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq",
-        )
-        df.to_parquet(df_path,index=False)
-
-        task = mlrun.NewTask(
-            name="drift_magnitude",
-            handler="drift_magnitude",
-            params={
-                "label_col": context.label_col,
-                "prediction_col": context.predictions_col,
-                "results_tsdb_container": context.results_tsdb_container,
-                "results_tsdb_table": context.results_tsdb_table,
-            },
-            inputs={"t": context.base_dataset, "u": df_path},
-            artifact_path=mlrun.mlconf.artifact_path,
-        )
-
-        context.virtual_drift_fn.run(task, watch=False)
-
-        context.batch = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.9.0/src/function.yaml b/functions/development/stream_to_parquet/0.9.0/src/function.yaml deleted file mode 100644 index f8786cc9..00000000 --- a/functions/development/stream_to_parquet/0.9.0/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: remote -metadata: - name: stream-to-parquet - tag: '' - hash: 78316bfbe731714715c19f0bc6deabf8652f15c4 - project: '' - labels: - author: orz - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Saves a stream to Parquet and can lunch drift detection task on it - min_replicas: 1 - max_replicas: 1 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: stream-to-parquet - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py - spec: - runtime: python:3.6 - handler: stream_to_parquet:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgb3MuZ2V0ZW52
KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg== - source: '' - build: - commands: [] - code_origin: 
https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py - origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/stream_to_parquet/0.9.0/src/item.yaml b/functions/development/stream_to_parquet/0.9.0/src/item.yaml deleted file mode 100644 index 7d283a01..00000000 --- a/functions/development/stream_to_parquet/0.9.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Saves a stream to Parquet and can lunch drift detection task on it -doc: '' -example: stream_to_parquet.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: stream-to-parquet -platformVersion: 3.2.0 -spec: - customFields: - max_replicas: 1 - min_replicas: 1 - filename: stream_to_parquet.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/stream_to_parquet/0.9.0/src/stream_to_parquet.ipynb b/functions/development/stream_to_parquet/0.9.0/src/stream_to_parquet.ipynb deleted file mode 100644 index e47c6be9..00000000 --- a/functions/development/stream_to_parquet/0.9.0/src/stream_to_parquet.ipynb +++ /dev/null @@ -1,698 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Stream to Parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Part of the [network operations](https://github.com/mlrun/demos/tree/0.7.x/network-operations) demo pipeline, this function listens to a labeld stream and writes it as parquet files.
\n", - "This function also deploys the function [virtual_drift](https://github.com/mlrun/functions/tree/master/virtual_drift) from the hub, which computes drift magnitude metrics between base dataset t and dataset u,
\n", - "in our case (as well as in the demo) - base dataset (the one that the model trained on) and the dataset the model predicted.
\n", - "virtual_drift writes the output to TSDB." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the labeled stream](#Creating-the-labeled-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the functioh remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "base_dataset = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "# The predicted test data is pushed to the stream\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the labeled stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "tsdb_path = os.path.join(user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 's2p'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:37:45,224 [info] created and saved project 
function-marketplace\n" - ] - } - ], - "source": [ - "import mlrun\n", - "\n", - "# Importing the function\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://stream_to_parquet:development\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "fn.add_v3io_stream_trigger(stream_path=input_stream, name='stream', group=stream_consumer_group)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:37:45,513 [info] Starting remote function deploy\n", - "2021-10-26 14:37:45 (info) Deploying function\n", - "2021-10-26 14:37:45 (info) Building\n", - "2021-10-26 14:37:45 (info) Staging files and preparing base images\n", - "2021-10-26 14:37:45 (info) Building processor image\n", - "2021-10-26 14:37:47 (info) Build complete\n", - "2021-10-26 14:37:55 (info) Function deploy complete\n", - "> 2021-10-26 14:37:55,689 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-stream-to-parquet.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31445']}\n" - ] - }, - { - "data": { - "text/plain": [ - "'http://default-tenant.app.dev39.lab.iguazeng.com:31445'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "fn.set_envs({'window': 200,\n", - " 'save_to': os.path.join(os.path.join('/User',rel_path), 'inference_pq'),\n", - " 'prediction_col': 'predicted_col',\n", - " 'label_col': 'class',\n", - " 'base_dataset': base_dataset,\n", - " 'results_tsdb_container': container[1:],\n", - " 'results_tsdb_table': tsdb_path,\n", - " 'mount_path': os.path.join(container,user),\n", - " 'mount_remote': container,\n", - " 
'artifact_path': os.path.join('/User',rel_path)})\n", - "\n", - "fn.deploy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818, \"class\": 1.0, \"predicted_col\": 1.0}]}, \"resp\": [1], \"when\": \"2021-10-26 14:37:55.864974\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not in ['when', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " instances[0].update({key: event.get(key)}) \n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Pushing some data to the input stream\n", - "step = 500\n", - "for i in range(0,20000,step):\n", - " response = client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " 
records=records[i:i+step])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 14:38:08.027000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:08.699000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:09.599000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:10.759000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:11.561000+00:000.0017590.0000250.0024881.010.01.0some_stream
........................
2021-10-26 14:39:42.037000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.191000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.586000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.816000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:49.180000+00:000.0017590.0000250.0024881.010.01.0some_stream
\n", - "

99 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " class_shift_helinger class_shift_kld \\\n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:08.699000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:09.599000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:10.759000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:11.561000+00:00 0.001759 0.000025 \n", - "... ... ... \n", - "2021-10-26 14:39:42.037000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.191000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.586000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.816000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:49.180000+00:00 0.001759 0.000025 \n", - "\n", - " class_shift_tvd prior_helinger prior_kld \\\n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:08.699000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:09.599000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:10.759000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:11.561000+00:00 0.002488 1.0 10.0 \n", - "... ... ... ... \n", - "2021-10-26 14:39:42.037000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.191000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.586000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.816000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:49.180000+00:00 0.002488 1.0 10.0 \n", - "\n", - " prior_tvd stream \n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:08.699000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:09.599000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:10.759000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:11.561000+00:00 1.0 some_stream \n", - "... ... ... 
\n", - "2021-10-26 14:39:42.037000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.191000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.586000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.816000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:49.180000+00:00 1.0 some_stream \n", - "\n", - "[99 rows x 7 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Reading from TSDB\n", - "import v3io_frames as v3f\n", - "\n", - "v3f_client = v3f.Client(os.environ[\"V3IO_FRAMESD\"],container=container[1:])\n", - "v3f_client.read(backend='tsdb',table=tsdb_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Stream-to-Parquet)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/stream_to_parquet/0.9.0/src/stream_to_parquet.py b/functions/development/stream_to_parquet/0.9.0/src/stream_to_parquet.py deleted file mode 100644 index 8ec5635f..00000000 --- a/functions/development/stream_to_parquet/0.9.0/src/stream_to_parquet.py +++ /dev/null @@ -1,82 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import json -import datetime -import mlrun - - -def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record - - -def init_context(context): - setattr(context, "batch", []) - setattr(context, "window", int(os.getenv("window", 10))) - 
setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply(mlrun.auto_mount()) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None)) - - -def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path,index=False) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - "results_tsdb_table": context.results_tsdb_table, - }, - inputs={"t": context.base_dataset, "u": df_path}, - 
artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = [] diff --git a/functions/development/stream_to_parquet/0.9.0/static/documentation.html b/functions/development/stream_to_parquet/0.9.0/static/documentation.html deleted file mode 100644 index 52e2c852..00000000 --- a/functions/development/stream_to_parquet/0.9.0/static/documentation.html +++ /dev/null @@ -1,137 +0,0 @@ - - - - - - - -stream_to_parquet package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

stream_to_parquet package

-
-

Submodules

-
-
-

stream_to_parquet.stream_to_parquet module

-
-
-stream_to_parquet.stream_to_parquet.handler(context, event)[source]
-
-
-
-stream_to_parquet.stream_to_parquet.init_context(context)[source]
-
-
-
-stream_to_parquet.stream_to_parquet.record_to_features(record)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.9.0/static/example.html b/functions/development/stream_to_parquet/0.9.0/static/example.html deleted file mode 100644 index 2fc930de..00000000 --- a/functions/development/stream_to_parquet/0.9.0/static/example.html +++ /dev/null @@ -1,643 +0,0 @@ - - - - - - - -Stream to Parquet - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Stream to Parquet

-

Part of the network operations demo pipeline, this function listens to a labeld stream and writes it as parquet files.
-This function also deploys the function virtual_drift from the hub, which computes drift magnitude metrics between base dataset t and dataset u,
-in our case (as well as in the demo) - base dataset (the one that the model trained on) and the dataset the model predicted.
-virtual_drift writes the output to TSDB.

- -
-

Data exploration

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-base_dataset = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-# The predicted test data is pushed to the stream
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the labeled stream

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-tsdb_path = os.path.join(user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 's2p'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function

-
-
-
import mlrun
-
-# Importing the function
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://stream_to_parquet:development")
-fn.apply(mlrun.auto_mount())
-
-fn.add_v3io_stream_trigger(stream_path=input_stream, name='stream', group=stream_consumer_group)
-
-
-
-
-
> 2021-10-26 14:37:45,224 [info] created and saved project function-marketplace
-
-
-
-
-
-
-

Running the function remotely

-
-
-
import json
-fn.set_envs({'window': 200,
-             'save_to': os.path.join(os.path.join('/User',rel_path), 'inference_pq'),
-             'prediction_col': 'predicted_col',
-             'label_col': 'class',
-             'base_dataset': base_dataset,
-             'results_tsdb_container': container[1:],
-             'results_tsdb_table': tsdb_path,
-             'mount_path': os.path.join(container,user),
-             'mount_remote': container,
-             'artifact_path': os.path.join('/User',rel_path)})
-
-fn.deploy()
-
-
-
-
-
> 2021-10-26 14:37:45,513 [info] Starting remote function deploy
-2021-10-26 14:37:45  (info) Deploying function
-2021-10-26 14:37:45  (info) Building
-2021-10-26 14:37:45  (info) Staging files and preparing base images
-2021-10-26 14:37:45  (info) Building processor image
-2021-10-26 14:37:47  (info) Build complete
-2021-10-26 14:37:55  (info) Function deploy complete
-> 2021-10-26 14:37:55,689 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-stream-to-parquet.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31445']}
-
-
-
'http://default-tenant.app.dev39.lab.iguazeng.com:31445'
-
-
-
-
-
-
-

Testing the function

-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    instances[0].update({key: event.get(key)})      
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818, "class": 1.0, "predicted_col": 1.0}]}, "resp": [1], "when": "2021-10-26 14:37:55.864974", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Pushing some data to the input stream
-step = 500
-for i in range(0,20000,step):
-    response = client.stream.put_records(container=container,
-                                              stream_path=base_input_stream, 
-                                              records=records[i:i+step])
-
-
-
-
-
-
-
# Reading from TSDB
-import v3io_frames as v3f
-
-v3f_client = v3f.Client(os.environ["V3IO_FRAMESD"],container=container[1:])
-v3f_client.read(backend='tsdb',table=tsdb_path)
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 14:38:08.027000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:08.699000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:09.599000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:10.759000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:11.561000+00:000.0017590.0000250.0024881.010.01.0some_stream
........................
2021-10-26 14:39:42.037000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.191000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.586000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.816000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:49.180000+00:000.0017590.0000250.0024881.010.01.0some_stream
-

99 rows × 7 columns

-
-
-

Back to the top

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.9.0/static/function.html b/functions/development/stream_to_parquet/0.9.0/static/function.html deleted file mode 100644 index 5cc5a2c2..00000000 --- a/functions/development/stream_to_parquet/0.9.0/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: stream-to-parquet
-  tag: ''
-  hash: 78316bfbe731714715c19f0bc6deabf8652f15c4
-  project: ''
-  labels:
-    author: orz
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Saves a stream to Parquet and can lunch drift detection task on it
-  min_replicas: 1
-  max_replicas: 1
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: stream-to-parquet
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py
-    spec:
-      runtime: python:3.6
-      handler: stream_to_parquet:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJ
fY29udGFpbmVyIiwgb3MuZ2V0ZW52KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg==
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py
-    origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.9.0/static/item.html b/functions/development/stream_to_parquet/0.9.0/static/item.html deleted file mode 100644 index 1f187b52..00000000 --- a/functions/development/stream_to_parquet/0.9.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Saves a stream to Parquet and can lunch drift detection task on it
-doc: ''
-example: stream_to_parquet.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: stream-to-parquet
-platformVersion: 3.2.0
-spec:
-  customFields:
-    max_replicas: 1
-    min_replicas: 1
-  filename: stream_to_parquet.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/0.9.0/static/source.html b/functions/development/stream_to_parquet/0.9.0/static/source.html deleted file mode 100644 index bf5efa53..00000000 --- a/functions/development/stream_to_parquet/0.9.0/static/source.html +++ /dev/null @@ -1,104 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-def record_to_features(record):
-    features = record["request"]["instances"][0]
-    timestamp = record["when"]
-    prediction = record["resp"]
-
-    record = {"timestamp": timestamp, **features, "predictions": prediction}
-
-    return record
-
-
-def init_context(context):
-    setattr(context, "batch", [])
-    setattr(context, "window", int(os.getenv("window", 10)))
-    setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/"))
-    os.makedirs(context.save_to, exist_ok=True)
-
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080"
-    artifact_path = os.getenv("artifact_path", None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if "hub_url" in os.environ:
-        mlrun.mlconf.hub_url = os.environ["hub_url"]
-    virtual_drift_fn = mlrun.import_function("hub://virtual_drift")
-    virtual_drift_fn.apply(mlrun.auto_mount())
-    setattr(context, "virtual_drift_fn", virtual_drift_fn)
-
-    predictions_col = os.getenv("predictions", None)
-    label_col = os.getenv("label_col", None)
-    setattr(context, "base_dataset", os.getenv("base_dataset", ""))
-    setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]")))
-    setattr(context, "predictions_col", predictions_col)
-    setattr(context, "label_col", label_col)
-    setattr(
-        context, "results_tsdb_container", os.getenv("results_tsdb_container", None)
-    )
-    setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
-
-
-def handler(context, event):
-
-    context.logger.info(f"Adding {event.body}")
-    context.batch.append(record_to_features(json.loads(event.body)))
-
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f"df example: {df.head(1)}")
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(
-            context.save_to,
-            f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq",
-        )
-        df.to_parquet(df_path,index=False)
-
-        task = mlrun.NewTask(
-            name="drift_magnitude",
-            handler="drift_magnitude",
-            params={
-                "label_col": context.label_col,
-                "prediction_col": context.predictions_col,
-                "results_tsdb_container": context.results_tsdb_container,
-                "results_tsdb_table": context.results_tsdb_table,
-            },
-            inputs={"t": context.base_dataset, "u": df_path},
-            artifact_path=mlrun.mlconf.artifact_path,
-        )
-
-        context.virtual_drift_fn.run(task, watch=False)
-
-        context.batch = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/1.1.0/src/function.yaml b/functions/development/stream_to_parquet/1.1.0/src/function.yaml deleted file mode 100644 index f8786cc9..00000000 --- a/functions/development/stream_to_parquet/1.1.0/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: remote -metadata: - name: stream-to-parquet - tag: '' - hash: 78316bfbe731714715c19f0bc6deabf8652f15c4 - project: '' - labels: - author: orz - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Saves a stream to Parquet and can lunch drift detection task on it - min_replicas: 1 - max_replicas: 1 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: stream-to-parquet - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py - spec: - runtime: python:3.6 - handler: stream_to_parquet:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgb3MuZ2V0ZW52
KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg== - source: '' - build: - commands: [] - code_origin: 
https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py - origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/stream_to_parquet/1.1.0/src/item.yaml b/functions/development/stream_to_parquet/1.1.0/src/item.yaml deleted file mode 100644 index cbd59376..00000000 --- a/functions/development/stream_to_parquet/1.1.0/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Saves a stream to Parquet and can lunch drift detection task on it -doc: '' -example: stream_to_parquet.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: stream-to-parquet -platformVersion: 3.5.0 -spec: - customFields: - max_replicas: 1 - min_replicas: 1 - filename: stream_to_parquet.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/stream_to_parquet/1.1.0/src/stream_to_parquet.ipynb b/functions/development/stream_to_parquet/1.1.0/src/stream_to_parquet.ipynb deleted file mode 100644 index e47c6be9..00000000 --- a/functions/development/stream_to_parquet/1.1.0/src/stream_to_parquet.ipynb +++ /dev/null @@ -1,698 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Stream to Parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Part of the [network operations](https://github.com/mlrun/demos/tree/0.7.x/network-operations) demo pipeline, this function listens to a labeld stream and writes it as parquet files.
\n", - "This function also deploys the function [virtual_drift](https://github.com/mlrun/functions/tree/master/virtual_drift) from the hub, which computes drift magnitude metrics between base dataset t and dataset u,
\n", - "in our case (as well as in the demo) - base dataset (the one that the model trained on) and the dataset the model predicted.
\n", - "virtual_drift writes the output to TSDB." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the labeled stream](#Creating-the-labeled-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the functioh remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "base_dataset = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "# The predicted test data is pushed to the stream\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the labeled stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "tsdb_path = os.path.join(user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 's2p'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:37:45,224 [info] created and saved project 
function-marketplace\n" - ] - } - ], - "source": [ - "import mlrun\n", - "\n", - "# Importing the function\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://stream_to_parquet:development\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "fn.add_v3io_stream_trigger(stream_path=input_stream, name='stream', group=stream_consumer_group)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:37:45,513 [info] Starting remote function deploy\n", - "2021-10-26 14:37:45 (info) Deploying function\n", - "2021-10-26 14:37:45 (info) Building\n", - "2021-10-26 14:37:45 (info) Staging files and preparing base images\n", - "2021-10-26 14:37:45 (info) Building processor image\n", - "2021-10-26 14:37:47 (info) Build complete\n", - "2021-10-26 14:37:55 (info) Function deploy complete\n", - "> 2021-10-26 14:37:55,689 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-stream-to-parquet.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31445']}\n" - ] - }, - { - "data": { - "text/plain": [ - "'http://default-tenant.app.dev39.lab.iguazeng.com:31445'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "fn.set_envs({'window': 200,\n", - " 'save_to': os.path.join(os.path.join('/User',rel_path), 'inference_pq'),\n", - " 'prediction_col': 'predicted_col',\n", - " 'label_col': 'class',\n", - " 'base_dataset': base_dataset,\n", - " 'results_tsdb_container': container[1:],\n", - " 'results_tsdb_table': tsdb_path,\n", - " 'mount_path': os.path.join(container,user),\n", - " 'mount_remote': container,\n", - " 
'artifact_path': os.path.join('/User',rel_path)})\n", - "\n", - "fn.deploy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818, \"class\": 1.0, \"predicted_col\": 1.0}]}, \"resp\": [1], \"when\": \"2021-10-26 14:37:55.864974\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not in ['when', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " instances[0].update({key: event.get(key)}) \n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Pushing some data to the input stream\n", - "step = 500\n", - "for i in range(0,20000,step):\n", - " response = client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " 
records=records[i:i+step])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 14:38:08.027000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:08.699000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:09.599000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:10.759000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:11.561000+00:000.0017590.0000250.0024881.010.01.0some_stream
........................
2021-10-26 14:39:42.037000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.191000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.586000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.816000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:49.180000+00:000.0017590.0000250.0024881.010.01.0some_stream
\n", - "

99 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " class_shift_helinger class_shift_kld \\\n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:08.699000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:09.599000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:10.759000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:11.561000+00:00 0.001759 0.000025 \n", - "... ... ... \n", - "2021-10-26 14:39:42.037000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.191000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.586000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.816000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:49.180000+00:00 0.001759 0.000025 \n", - "\n", - " class_shift_tvd prior_helinger prior_kld \\\n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:08.699000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:09.599000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:10.759000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:11.561000+00:00 0.002488 1.0 10.0 \n", - "... ... ... ... \n", - "2021-10-26 14:39:42.037000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.191000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.586000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.816000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:49.180000+00:00 0.002488 1.0 10.0 \n", - "\n", - " prior_tvd stream \n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:08.699000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:09.599000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:10.759000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:11.561000+00:00 1.0 some_stream \n", - "... ... ... 
\n", - "2021-10-26 14:39:42.037000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.191000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.586000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.816000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:49.180000+00:00 1.0 some_stream \n", - "\n", - "[99 rows x 7 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Reading from TSDB\n", - "import v3io_frames as v3f\n", - "\n", - "v3f_client = v3f.Client(os.environ[\"V3IO_FRAMESD\"],container=container[1:])\n", - "v3f_client.read(backend='tsdb',table=tsdb_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Stream-to-Parquet)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/stream_to_parquet/1.1.0/src/stream_to_parquet.py b/functions/development/stream_to_parquet/1.1.0/src/stream_to_parquet.py deleted file mode 100644 index 175c1282..00000000 --- a/functions/development/stream_to_parquet/1.1.0/src/stream_to_parquet.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import json -import datetime -import mlrun - - -def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record - - -def init_context(context): - setattr(context, "batch", []) - setattr(context, "window", int(os.getenv("window", 10))) - setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply(mlrun.auto_mount()) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None)) - - -def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - 
context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path,index=False) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - "results_tsdb_table": context.results_tsdb_table, - }, - inputs={"t": context.base_dataset, "u": df_path}, - artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = [] diff --git a/functions/development/stream_to_parquet/1.1.0/static/documentation.html b/functions/development/stream_to_parquet/1.1.0/static/documentation.html deleted file mode 100644 index efefa293..00000000 --- a/functions/development/stream_to_parquet/1.1.0/static/documentation.html +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - -stream_to_parquet package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

stream_to_parquet package

- -
- -
-
-
-
-
-

stream_to_parquet package#

-
-

Submodules#

-
-
-

stream_to_parquet.stream_to_parquet module#

-
-
-stream_to_parquet.stream_to_parquet.handler(context, event)[source]#
-
-
-
-stream_to_parquet.stream_to_parquet.init_context(context)[source]#
-
-
-
-stream_to_parquet.stream_to_parquet.record_to_features(record)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/1.1.0/static/example.html b/functions/development/stream_to_parquet/1.1.0/static/example.html deleted file mode 100644 index 05826f82..00000000 --- a/functions/development/stream_to_parquet/1.1.0/static/example.html +++ /dev/null @@ -1,777 +0,0 @@ - - - - - - - -Stream to Parquet - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Stream to Parquet#

-

Part of the network operations demo pipeline, this function listens to a labeld stream and writes it as parquet files.
-This function also deploys the function virtual_drift from the hub, which computes drift magnitude metrics between base dataset t and dataset u,
-in our case (as well as in the demo) - base dataset (the one that the model trained on) and the dataset the model predicted.
-virtual_drift writes the output to TSDB.

-
-

Steps#

-
    -
  1. Data exploration

  2. -
  3. Creating the labeled stream

  4. -
  5. Importing the function

  6. -
  7. Running the functioh remotely

  8. -
  9. Testing the function

  10. -
-
-
-

Data exploration#

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-base_dataset = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-# The predicted test data is pushed to the stream
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the labeled stream#

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-tsdb_path = os.path.join(user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 's2p'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function#

-
-
-
import mlrun
-
-# Importing the function
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://stream_to_parquet:development")
-fn.apply(mlrun.auto_mount())
-
-fn.add_v3io_stream_trigger(stream_path=input_stream, name='stream', group=stream_consumer_group)
-
-
-
-
-
> 2021-10-26 14:37:45,224 [info] created and saved project function-marketplace
-
-
-
-
-
-
-

Running the function remotely#

-
-
-
import json
-fn.set_envs({'window': 200,
-             'save_to': os.path.join(os.path.join('/User',rel_path), 'inference_pq'),
-             'prediction_col': 'predicted_col',
-             'label_col': 'class',
-             'base_dataset': base_dataset,
-             'results_tsdb_container': container[1:],
-             'results_tsdb_table': tsdb_path,
-             'mount_path': os.path.join(container,user),
-             'mount_remote': container,
-             'artifact_path': os.path.join('/User',rel_path)})
-
-fn.deploy()
-
-
-
-
-
> 2021-10-26 14:37:45,513 [info] Starting remote function deploy
-2021-10-26 14:37:45  (info) Deploying function
-2021-10-26 14:37:45  (info) Building
-2021-10-26 14:37:45  (info) Staging files and preparing base images
-2021-10-26 14:37:45  (info) Building processor image
-2021-10-26 14:37:47  (info) Build complete
-2021-10-26 14:37:55  (info) Function deploy complete
-> 2021-10-26 14:37:55,689 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-stream-to-parquet.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31445']}
-
-
-
'http://default-tenant.app.dev39.lab.iguazeng.com:31445'
-
-
-
-
-
-
-

Testing the function#

-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    instances[0].update({key: event.get(key)})      
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818, "class": 1.0, "predicted_col": 1.0}]}, "resp": [1], "when": "2021-10-26 14:37:55.864974", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Pushing some data to the input stream
-step = 500
-for i in range(0,20000,step):
-    response = client.stream.put_records(container=container,
-                                              stream_path=base_input_stream, 
-                                              records=records[i:i+step])
-
-
-
-
-
-
-
# Reading from TSDB
-import v3io_frames as v3f
-
-v3f_client = v3f.Client(os.environ["V3IO_FRAMESD"],container=container[1:])
-v3f_client.read(backend='tsdb',table=tsdb_path)
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 14:38:08.027000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:08.699000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:09.599000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:10.759000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:11.561000+00:000.0017590.0000250.0024881.010.01.0some_stream
........................
2021-10-26 14:39:42.037000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.191000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.586000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.816000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:49.180000+00:000.0017590.0000250.0024881.010.01.0some_stream
-

99 rows × 7 columns

-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/1.1.0/static/function.html b/functions/development/stream_to_parquet/1.1.0/static/function.html deleted file mode 100644 index 5cc5a2c2..00000000 --- a/functions/development/stream_to_parquet/1.1.0/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: stream-to-parquet
-  tag: ''
-  hash: 78316bfbe731714715c19f0bc6deabf8652f15c4
-  project: ''
-  labels:
-    author: orz
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Saves a stream to Parquet and can lunch drift detection task on it
-  min_replicas: 1
-  max_replicas: 1
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: stream-to-parquet
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py
-    spec:
-      runtime: python:3.6
-      handler: stream_to_parquet:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJ
fY29udGFpbmVyIiwgb3MuZ2V0ZW52KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg==
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py
-    origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/1.1.0/static/item.html b/functions/development/stream_to_parquet/1.1.0/static/item.html deleted file mode 100644 index 7ffcdaaf..00000000 --- a/functions/development/stream_to_parquet/1.1.0/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Saves a stream to Parquet and can lunch drift detection task on it
-doc: ''
-example: stream_to_parquet.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: stream-to-parquet
-platformVersion: 3.5.0
-spec:
-  customFields:
-    max_replicas: 1
-    min_replicas: 1
-  filename: stream_to_parquet.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/1.1.0/static/source.html b/functions/development/stream_to_parquet/1.1.0/static/source.html deleted file mode 100644 index acca6741..00000000 --- a/functions/development/stream_to_parquet/1.1.0/static/source.html +++ /dev/null @@ -1,118 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-def record_to_features(record):
-    features = record["request"]["instances"][0]
-    timestamp = record["when"]
-    prediction = record["resp"]
-
-    record = {"timestamp": timestamp, **features, "predictions": prediction}
-
-    return record
-
-
-def init_context(context):
-    setattr(context, "batch", [])
-    setattr(context, "window", int(os.getenv("window", 10)))
-    setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/"))
-    os.makedirs(context.save_to, exist_ok=True)
-
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080"
-    artifact_path = os.getenv("artifact_path", None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if "hub_url" in os.environ:
-        mlrun.mlconf.hub_url = os.environ["hub_url"]
-    virtual_drift_fn = mlrun.import_function("hub://virtual_drift")
-    virtual_drift_fn.apply(mlrun.auto_mount())
-    setattr(context, "virtual_drift_fn", virtual_drift_fn)
-
-    predictions_col = os.getenv("predictions", None)
-    label_col = os.getenv("label_col", None)
-    setattr(context, "base_dataset", os.getenv("base_dataset", ""))
-    setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]")))
-    setattr(context, "predictions_col", predictions_col)
-    setattr(context, "label_col", label_col)
-    setattr(
-        context, "results_tsdb_container", os.getenv("results_tsdb_container", None)
-    )
-    setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
-
-
-def handler(context, event):
-
-    context.logger.info(f"Adding {event.body}")
-    context.batch.append(record_to_features(json.loads(event.body)))
-
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f"df example: {df.head(1)}")
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(
-            context.save_to,
-            f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq",
-        )
-        df.to_parquet(df_path,index=False)
-
-        task = mlrun.NewTask(
-            name="drift_magnitude",
-            handler="drift_magnitude",
-            params={
-                "label_col": context.label_col,
-                "prediction_col": context.predictions_col,
-                "results_tsdb_container": context.results_tsdb_container,
-                "results_tsdb_table": context.results_tsdb_table,
-            },
-            inputs={"t": context.base_dataset, "u": df_path},
-            artifact_path=mlrun.mlconf.artifact_path,
-        )
-
-        context.virtual_drift_fn.run(task, watch=False)
-
-        context.batch = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/1.1.0/static/stream_to_parquet.html b/functions/development/stream_to_parquet/1.1.0/static/stream_to_parquet.html deleted file mode 100644 index e9d43572..00000000 --- a/functions/development/stream_to_parquet/1.1.0/static/stream_to_parquet.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - -stream_to_parquet.stream_to_parquet - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for stream_to_parquet.stream_to_parquet

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-
[docs]def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record
- - -
[docs]def init_context(context): - setattr(context, "batch", []) - setattr(context, "window", int(os.getenv("window", 10))) - setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply(mlrun.auto_mount()) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
- - -
[docs]def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path,index=False) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - "results_tsdb_table": context.results_tsdb_table, - }, - inputs={"t": context.base_dataset, "u": df_path}, - artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = []
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/latest/src/function.yaml b/functions/development/stream_to_parquet/latest/src/function.yaml deleted file mode 100644 index f8786cc9..00000000 --- a/functions/development/stream_to_parquet/latest/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: remote -metadata: - name: stream-to-parquet - tag: '' - hash: 78316bfbe731714715c19f0bc6deabf8652f15c4 - project: '' - labels: - author: orz - categories: - - machine-learning - - data-preparation -spec: - command: '' - args: [] - image: mlrun/ml-models - description: Saves a stream to Parquet and can lunch drift detection task on it - min_replicas: 1 - max_replicas: 1 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: stream-to-parquet - labels: {} - annotations: - nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py - spec: - runtime: python:3.6 - handler: stream_to_parquet:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgb3MuZ2V0ZW52
KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg== - source: '' - build: - commands: [] - code_origin: 
https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py - origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py - default_handler: handler - disable_auto_mount: false - affinity: null -verbose: false diff --git a/functions/development/stream_to_parquet/latest/src/item.yaml b/functions/development/stream_to_parquet/latest/src/item.yaml deleted file mode 100644 index cbd59376..00000000 --- a/functions/development/stream_to_parquet/latest/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- machine-learning -- data-preparation -description: Saves a stream to Parquet and can lunch drift detection task on it -doc: '' -example: stream_to_parquet.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: stream-to-parquet -platformVersion: 3.5.0 -spec: - customFields: - max_replicas: 1 - min_replicas: 1 - filename: stream_to_parquet.py - handler: handler - image: mlrun/ml-models - kind: nuclio - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/stream_to_parquet/latest/src/stream_to_parquet.ipynb b/functions/development/stream_to_parquet/latest/src/stream_to_parquet.ipynb deleted file mode 100644 index e47c6be9..00000000 --- a/functions/development/stream_to_parquet/latest/src/stream_to_parquet.ipynb +++ /dev/null @@ -1,698 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Stream to Parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Part of the [network operations](https://github.com/mlrun/demos/tree/0.7.x/network-operations) demo pipeline, this function listens to a labeld stream and writes it as parquet files.
\n", - "This function also deploys the function [virtual_drift](https://github.com/mlrun/functions/tree/master/virtual_drift) from the hub, which computes drift magnitude metrics between base dataset t and dataset u,
\n", - "in our case (as well as in the demo) - base dataset (the one that the model trained on) and the dataset the model predicted.
\n", - "virtual_drift writes the output to TSDB." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Creating the labeled stream](#Creating-the-labeled-stream)\n", - "3. [Importing the function](#Importing-the-function)\n", - "4. [Running the functioh remotely](#Running-the-function-remotely)\n", - "5. [Testing the function](#Testing-the-function)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
\n", - "This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
\n", - "[Harvard dataverse](https://dataverse.harvard.edu) provides futher explanations on the [used dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5OWRGB) along with different kinds of drifted datasets.
\n", - "mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
\n", - "Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using [sklearn_classifer](https://github.com/mlrun/functions/blob/master/sklearn_classifier/sklearn_classifier.ipynb)).
\n", - "The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class\n", - "0 0.0 1.0 0.460101 0.592744 1.0\n", - "1 1.0 1.0 0.588788 0.574984 0.0\n", - "2 0.0 0.0 0.401641 0.679325 1.0\n", - "3 1.0 1.0 0.306076 0.182108 0.0\n", - "4 0.0 0.0 0.962847 0.579245 1.0" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'\n", - "base_dataset = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'\n", - "# The predicted test data is pushed to the stream\n", - "predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'\n", - "# You can find the model used here\n", - "models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'\n", - "original_data = pd.read_csv(data_path)\n", - "original_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
\n", - "
" - ], - "text/plain": [ - " X1 X2 X3 X4 class predicted_col\n", - "34995 0.0 0.0 0.010106 0.647269 0.0 1.0\n", - "34996 1.0 1.0 0.293651 0.737291 1.0 0.0\n", - "34997 0.0 0.0 0.848546 0.552337 0.0 1.0\n", - "34998 1.0 1.0 0.614754 0.859896 1.0 0.0\n", - "34999 1.0 0.0 0.265306 0.843716 0.0 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predicted_test = pd.read_csv(predicted_test_data_path)\n", - "predicted_test.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Creating the labeled stream**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "\n", - "base_input_stream = os.path.join(user,rel_path) + \"/inputs_stream\"\n", - "base_output_stream = os.path.join(user,rel_path) + \"/output_stream\"\n", - "input_stream = os.path.join(container,base_input_stream)\n", - "tsdb_path = os.path.join(user,rel_path) + \"/output_tsdb\"\n", - "\n", - "stream_consumer_group = 's2p'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import v3io.dataplane\n", - "\n", - "client = v3io.dataplane.Client()\n", - "response = client.stream.create(container = container,\n", - " stream_path=base_input_stream,\n", - " shard_count=1,\n", - " raise_for_status = v3io.dataplane.RaiseForStatus.never)\n", - "response.raise_for_status([409, 204])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:37:45,224 [info] created and saved project 
function-marketplace\n" - ] - } - ], - "source": [ - "import mlrun\n", - "\n", - "# Importing the function\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://stream_to_parquet:development\")\n", - "fn.apply(mlrun.auto_mount())\n", - "\n", - "fn.add_v3io_stream_trigger(stream_path=input_stream, name='stream', group=stream_consumer_group)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function remotely**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:37:45,513 [info] Starting remote function deploy\n", - "2021-10-26 14:37:45 (info) Deploying function\n", - "2021-10-26 14:37:45 (info) Building\n", - "2021-10-26 14:37:45 (info) Staging files and preparing base images\n", - "2021-10-26 14:37:45 (info) Building processor image\n", - "2021-10-26 14:37:47 (info) Build complete\n", - "2021-10-26 14:37:55 (info) Function deploy complete\n", - "> 2021-10-26 14:37:55,689 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-stream-to-parquet.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31445']}\n" - ] - }, - { - "data": { - "text/plain": [ - "'http://default-tenant.app.dev39.lab.iguazeng.com:31445'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "fn.set_envs({'window': 200,\n", - " 'save_to': os.path.join(os.path.join('/User',rel_path), 'inference_pq'),\n", - " 'prediction_col': 'predicted_col',\n", - " 'label_col': 'class',\n", - " 'base_dataset': base_dataset,\n", - " 'results_tsdb_container': container[1:],\n", - " 'results_tsdb_table': tsdb_path,\n", - " 'mount_path': os.path.join(container,user),\n", - " 'mount_remote': container,\n", - " 
'artifact_path': os.path.join('/User',rel_path)})\n", - "\n", - "fn.deploy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Testing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'data': '{\"request\": {\"instances\": [{\"X1\": 0.0, \"X2\": 0.0, \"X3\": 0.0634475073, \"X4\": 0.4136568818, \"class\": 1.0, \"predicted_col\": 1.0}]}, \"resp\": [1], \"when\": \"2021-10-26 14:37:55.864974\", \"model\": \"sklearn.ensemble.RandomForestClassifier\"}'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import datetime\n", - "\n", - "# Reshaping the data to V3IOStream format.\n", - "def restructure_stream_event(context, event):\n", - " instances = [dict()]\n", - " for key in predicted_test.keys():\n", - " if key not in ['when', 'model', 'worker', 'hostname', 'predicted_col']:\n", - " instances[0].update({key: event.pop(key)})\n", - " instances[0].update({key: event.get(key)}) \n", - " event['request'] = {'instances': instances}\n", - " event['resp'] = [int(event.pop('predicted_col'))]\n", - " event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format=\"%Y-%m-%d %H:%M:%S.%f\")\n", - " event['model'] = 'sklearn.ensemble.RandomForestClassifier'\n", - " return event\n", - " \n", - " \n", - "records = json.loads(predicted_test.to_json(orient='records'))\n", - "records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]\n", - "\n", - "# showing first record\n", - "records[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Pushing some data to the input stream\n", - "step = 500\n", - "for i in range(0,20000,step):\n", - " response = client.stream.put_records(container=container,\n", - " stream_path=base_input_stream, \n", - " 
records=records[i:i+step])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 14:38:08.027000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:08.699000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:09.599000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:10.759000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:11.561000+00:000.0017590.0000250.0024881.010.01.0some_stream
........................
2021-10-26 14:39:42.037000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.191000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.586000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.816000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:49.180000+00:000.0017590.0000250.0024881.010.01.0some_stream
\n", - "

99 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " class_shift_helinger class_shift_kld \\\n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:08.699000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:09.599000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:10.759000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:38:11.561000+00:00 0.001759 0.000025 \n", - "... ... ... \n", - "2021-10-26 14:39:42.037000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.191000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.586000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:42.816000+00:00 0.001759 0.000025 \n", - "2021-10-26 14:39:49.180000+00:00 0.001759 0.000025 \n", - "\n", - " class_shift_tvd prior_helinger prior_kld \\\n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:08.699000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:09.599000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:10.759000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:38:11.561000+00:00 0.002488 1.0 10.0 \n", - "... ... ... ... \n", - "2021-10-26 14:39:42.037000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.191000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.586000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:42.816000+00:00 0.002488 1.0 10.0 \n", - "2021-10-26 14:39:49.180000+00:00 0.002488 1.0 10.0 \n", - "\n", - " prior_tvd stream \n", - "time \n", - "2021-10-26 14:38:08.027000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:08.699000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:09.599000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:10.759000+00:00 1.0 some_stream \n", - "2021-10-26 14:38:11.561000+00:00 1.0 some_stream \n", - "... ... ... 
\n", - "2021-10-26 14:39:42.037000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.191000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.586000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:42.816000+00:00 1.0 some_stream \n", - "2021-10-26 14:39:49.180000+00:00 1.0 some_stream \n", - "\n", - "[99 rows x 7 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Reading from TSDB\n", - "import v3io_frames as v3f\n", - "\n", - "v3f_client = v3f.Client(os.environ[\"V3IO_FRAMESD\"],container=container[1:])\n", - "v3f_client.read(backend='tsdb',table=tsdb_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Stream-to-Parquet)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/stream_to_parquet/latest/src/stream_to_parquet.py b/functions/development/stream_to_parquet/latest/src/stream_to_parquet.py deleted file mode 100644 index 175c1282..00000000 --- a/functions/development/stream_to_parquet/latest/src/stream_to_parquet.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import json -import datetime -import mlrun - - -def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record - - -def init_context(context): - setattr(context, "batch", []) - setattr(context, "window", int(os.getenv("window", 10))) - setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply(mlrun.auto_mount()) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None)) - - -def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - 
context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path,index=False) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - "results_tsdb_table": context.results_tsdb_table, - }, - inputs={"t": context.base_dataset, "u": df_path}, - artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = [] diff --git a/functions/development/stream_to_parquet/latest/static/documentation.html b/functions/development/stream_to_parquet/latest/static/documentation.html deleted file mode 100644 index efefa293..00000000 --- a/functions/development/stream_to_parquet/latest/static/documentation.html +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - -stream_to_parquet package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

stream_to_parquet package

- -
- -
-
-
-
-
-

stream_to_parquet package#

-
-

Submodules#

-
-
-

stream_to_parquet.stream_to_parquet module#

-
-
-stream_to_parquet.stream_to_parquet.handler(context, event)[source]#
-
-
-
-stream_to_parquet.stream_to_parquet.init_context(context)[source]#
-
-
-
-stream_to_parquet.stream_to_parquet.record_to_features(record)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/latest/static/example.html b/functions/development/stream_to_parquet/latest/static/example.html deleted file mode 100644 index 05826f82..00000000 --- a/functions/development/stream_to_parquet/latest/static/example.html +++ /dev/null @@ -1,777 +0,0 @@ - - - - - - - -Stream to Parquet - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Stream to Parquet#

-

Part of the network operations demo pipeline, this function listens to a labeld stream and writes it as parquet files.
-This function also deploys the function virtual_drift from the hub, which computes drift magnitude metrics between base dataset t and dataset u,
-in our case (as well as in the demo) - base dataset (the one that the model trained on) and the dataset the model predicted.
-virtual_drift writes the output to TSDB.

-
-

Steps#

-
    -
  1. Data exploration

  2. -
  3. Creating the labeled stream

  4. -
  5. Importing the function

  6. -
  7. Running the functioh remotely

  8. -
  9. Testing the function

  10. -
-
-
-

Data exploration#

-

In order to know about the performance of a drift detector by measuring the different detection metrics, we need to know beforehand where a real drift occurs.
-This is only possible with synthetic datasets.
The scikit-multiflow framework allows generating several kinds of synthetic data to simulate the occurrence of drifts.
-Harvard dataverse provides futher explanations on the used dataset along with different kinds of drifted datasets.
-mixed_0101_abrupto has 4 concepts and 3 drifts at time steps 10000, 20000, and 30000.
-Our dataset will be train-test-splitted, the train part (first 5000 examples) is used to train the model (that is generated easly using sklearn_classifer).
-The test part (which is already predicted by the model) will be pushed to the input stream in order to detect drifts.

-
-
-
import pandas as pd
-data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/mixed_0101_abrupto.csv'
-base_dataset = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_train.csv'
-# The predicted test data is pushed to the stream
-predicted_test_data_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/concept_drift/predicted_abrupto_test.csv'
-# You can find the model used here
-models_path = 'https://s3.wasabisys.com/iguazio/models/function-marketplace-models/concept_drift/concept_drift_random_forest.pkl'
-original_data = pd.read_csv(data_path)
-original_data.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4class
00.01.00.4601010.5927441.0
11.01.00.5887880.5749840.0
20.00.00.4016410.6793251.0
31.01.00.3060760.1821080.0
40.00.00.9628470.5792451.0
-
-
-
-
-
predicted_test = pd.read_csv(predicted_test_data_path)
-predicted_test.tail()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
X1X2X3X4classpredicted_col
349950.00.00.0101060.6472690.01.0
349961.01.00.2936510.7372911.00.0
349970.00.00.8485460.5523370.01.0
349981.01.00.6147540.8598961.00.0
349991.00.00.2653060.8437160.01.0
-
-
-
-
-

Creating the labeled stream#

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-
-base_input_stream = os.path.join(user,rel_path) + "/inputs_stream"
-base_output_stream = os.path.join(user,rel_path) + "/output_stream"
-input_stream = os.path.join(container,base_input_stream)
-tsdb_path = os.path.join(user,rel_path) + "/output_tsdb"
-
-stream_consumer_group = 's2p'
-
-
-
-
-
-
-
import v3io.dataplane
-
-client = v3io.dataplane.Client()
-response = client.stream.create(container = container,
-                                stream_path=base_input_stream,
-                                shard_count=1,
-                                raise_for_status = v3io.dataplane.RaiseForStatus.never)
-response.raise_for_status([409, 204])
-
-
-
-
-
-
-

Importing the function#

-
-
-
import mlrun
-
-# Importing the function
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://stream_to_parquet:development")
-fn.apply(mlrun.auto_mount())
-
-fn.add_v3io_stream_trigger(stream_path=input_stream, name='stream', group=stream_consumer_group)
-
-
-
-
-
> 2021-10-26 14:37:45,224 [info] created and saved project function-marketplace
-
-
-
-
-
-
-

Running the function remotely#

-
-
-
import json
-fn.set_envs({'window': 200,
-             'save_to': os.path.join(os.path.join('/User',rel_path), 'inference_pq'),
-             'prediction_col': 'predicted_col',
-             'label_col': 'class',
-             'base_dataset': base_dataset,
-             'results_tsdb_container': container[1:],
-             'results_tsdb_table': tsdb_path,
-             'mount_path': os.path.join(container,user),
-             'mount_remote': container,
-             'artifact_path': os.path.join('/User',rel_path)})
-
-fn.deploy()
-
-
-
-
-
> 2021-10-26 14:37:45,513 [info] Starting remote function deploy
-2021-10-26 14:37:45  (info) Deploying function
-2021-10-26 14:37:45  (info) Building
-2021-10-26 14:37:45  (info) Staging files and preparing base images
-2021-10-26 14:37:45  (info) Building processor image
-2021-10-26 14:37:47  (info) Build complete
-2021-10-26 14:37:55  (info) Function deploy complete
-> 2021-10-26 14:37:55,689 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-function-marketplace-stream-to-parquet.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev39.lab.iguazeng.com:31445']}
-
-
-
'http://default-tenant.app.dev39.lab.iguazeng.com:31445'
-
-
-
-
-
-
-

Testing the function#

-
-
-
import json
-import datetime
-
-# Reshaping the data to V3IOStream format.
-def restructure_stream_event(context, event):
-    instances = [dict()]
-    for key in predicted_test.keys():
-        if key not in ['when', 'model', 'worker', 'hostname', 'predicted_col']:
-            instances[0].update({key: event.pop(key)})
-    instances[0].update({key: event.get(key)})      
-    event['request'] = {'instances': instances}
-    event['resp'] = [int(event.pop('predicted_col'))]
-    event['when'] = datetime.datetime.strftime(datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S.%f")
-    event['model'] = 'sklearn.ensemble.RandomForestClassifier'
-    return event
-    
-    
-records = json.loads(predicted_test.to_json(orient='records'))
-records = [{'data': json.dumps(restructure_stream_event(context, record))} for record in records]
-
-# showing first record
-records[0]
-
-
-
-
-
{'data': '{"request": {"instances": [{"X1": 0.0, "X2": 0.0, "X3": 0.0634475073, "X4": 0.4136568818, "class": 1.0, "predicted_col": 1.0}]}, "resp": [1], "when": "2021-10-26 14:37:55.864974", "model": "sklearn.ensemble.RandomForestClassifier"}'}
-
-
-
-
-
-
-
# Pushing some data to the input stream
-step = 500
-for i in range(0,20000,step):
-    response = client.stream.put_records(container=container,
-                                              stream_path=base_input_stream, 
-                                              records=records[i:i+step])
-
-
-
-
-
-
-
# Reading from TSDB
-import v3io_frames as v3f
-
-v3f_client = v3f.Client(os.environ["V3IO_FRAMESD"],container=container[1:])
-v3f_client.read(backend='tsdb',table=tsdb_path)
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 14:38:08.027000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:08.699000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:09.599000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:10.759000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:38:11.561000+00:000.0017590.0000250.0024881.010.01.0some_stream
........................
2021-10-26 14:39:42.037000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.191000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.586000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:42.816000+00:000.0017590.0000250.0024881.010.01.0some_stream
2021-10-26 14:39:49.180000+00:000.0017590.0000250.0024881.010.01.0some_stream
-

99 rows × 7 columns

-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/latest/static/function.html b/functions/development/stream_to_parquet/latest/static/function.html deleted file mode 100644 index 5cc5a2c2..00000000 --- a/functions/development/stream_to_parquet/latest/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: stream-to-parquet
-  tag: ''
-  hash: 78316bfbe731714715c19f0bc6deabf8652f15c4
-  project: ''
-  labels:
-    author: orz
-  categories:
-  - machine-learning
-  - data-preparation
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  description: Saves a stream to Parquet and can lunch drift detection task on it
-  min_replicas: 1
-  max_replicas: 1
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: stream-to-parquet
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /User/test/functions/stream_to_parquet/stream_to_parquet.py
-    spec:
-      runtime: python:3.6
-      handler: stream_to_parquet:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCgpkZWYgcmVjb3JkX3RvX2ZlYXR1cmVzKHJlY29yZCk6CiAgICBmZWF0dXJlcyA9IHJlY29yZFsicmVxdWVzdCJdWyJpbnN0YW5jZXMiXVswXQogICAgdGltZXN0YW1wID0gcmVjb3JkWyJ3aGVuIl0KICAgIHByZWRpY3Rpb24gPSByZWNvcmRbInJlc3AiXQoKICAgIHJlY29yZCA9IHsidGltZXN0YW1wIjogdGltZXN0YW1wLCAqKmZlYXR1cmVzLCAicHJlZGljdGlvbnMiOiBwcmVkaWN0aW9ufQoKICAgIHJldHVybiByZWNvcmQKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgc2V0YXR0cihjb250ZXh0LCAiYmF0Y2giLCBbXSkKICAgIHNldGF0dHIoY29udGV4dCwgIndpbmRvdyIsIGludChvcy5nZXRlbnYoIndpbmRvdyIsIDEwKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJzYXZlX3RvIiwgb3MuZ2V0ZW52KCJzYXZlX3RvIiwgIi9iaWdkYXRhL2luZmVyZW5jZV9wcS8iKSkKICAgIG9zLm1ha2VkaXJzKGNvbnRleHQuc2F2ZV90bywgZXhpc3Rfb2s9VHJ1ZSkKCiAgICBtbHJ1bi5tbGNvbmYuZGJwYXRoID0gbWxydW4ubWxjb25mLmRicGF0aCBvciAiaHR0cDovL21scnVuLWFwaTo4MDgwIgogICAgYXJ0aWZhY3RfcGF0aCA9IG9zLmdldGVudigiYXJ0aWZhY3RfcGF0aCIsIE5vbmUpCiAgICBpZiBhcnRpZmFjdF9wYXRoOgogICAgICAgIG1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoID0gYXJ0aWZhY3RfcGF0aAogICAgaWYgImh1Yl91cmwiIGluIG9zLmVudmlyb246CiAgICAgICAgbWxydW4ubWxjb25mLmh1Yl91cmwgPSBvcy5lbnZpcm9uWyJodWJfdXJsIl0KICAgIHZpcnR1YWxfZHJpZnRfZm4gPSBtbHJ1bi5pbXBvcnRfZnVuY3Rpb24oImh1YjovL3ZpcnR1YWxfZHJpZnQiKQogICAgdmlydHVhbF9kcmlmdF9mbi5hcHBseShtbHJ1bi5hdXRvX21vdW50KCkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJ2aXJ0dWFsX2RyaWZ0X2ZuIiwgdmlydHVhbF9kcmlmdF9mbikKCiAgICBwcmVkaWN0aW9uc19jb2wgPSBvcy5nZXRlbnYoInByZWRpY3Rpb25zIiwgTm9uZSkKICAgIGxhYmVsX2NvbCA9IG9zLmdldGVudigibGFiZWxfY29sIiwgTm9uZSkKICAgIHNldGF0dHIoY29udGV4dCwgImJhc2VfZGF0YXNldCIsIG9zLmdldGVudigiYmFzZV9kYXRhc2V0IiwgIiIpKQogICAgc2V0YXR0cihjb250ZXh0LCAiaW5kZXhlcyIsIGpzb24ubG9hZHMob3MuZW52aXJvbi5nZXQoImluZGV4ZXMiLCAiW10iKSkpCiAgICBzZXRhdHRyKGNvbnRleHQsICJwcmVkaWN0aW9uc19jb2wiLCBwcmVkaWN0aW9uc19jb2wpCiAgICBzZXRhdHRyKGNvbnRleHQsICJsYWJlbF9jb2wiLCBsYWJlbF9jb2wpCiAgICBzZXRhdHRyKAogICAgICAgIGNvbnRleHQsICJyZXN1bHRzX3RzZGJ
fY29udGFpbmVyIiwgb3MuZ2V0ZW52KCJyZXN1bHRzX3RzZGJfY29udGFpbmVyIiwgTm9uZSkKICAgICkKICAgIHNldGF0dHIoY29udGV4dCwgInJlc3VsdHNfdHNkYl90YWJsZSIsIG9zLmdldGVudigicmVzdWx0c190c2RiX3RhYmxlIiwgTm9uZSkpCgoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJBZGRpbmcge2V2ZW50LmJvZHl9IikKICAgIGNvbnRleHQuYmF0Y2guYXBwZW5kKHJlY29yZF90b19mZWF0dXJlcyhqc29uLmxvYWRzKGV2ZW50LmJvZHkpKSkKCiAgICBpZiBsZW4oY29udGV4dC5iYXRjaCkgPiBjb250ZXh0LndpbmRvdzoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGNvbnRleHQuYmF0Y2hbOjFdKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oY29udGV4dC5pbmRleGVzKQogICAgICAgIGRmID0gcGQuRGF0YUZyYW1lKGNvbnRleHQuYmF0Y2gpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmImRmIGV4YW1wbGU6IHtkZi5oZWFkKDEpfSIpCiAgICAgICAgaWYgY29udGV4dC5pbmRleGVzOgogICAgICAgICAgICBkZiA9IGRmLnNldF9pbmRleChjb250ZXh0LmluZGV4ZXMpCiAgICAgICAgZGZfcGF0aCA9IG9zLnBhdGguam9pbigKICAgICAgICAgICAgY29udGV4dC5zYXZlX3RvLAogICAgICAgICAgICBmIntkYXRldGltZS5kYXRldGltZS5ub3coKS5zdHJmdGltZSgnJVktJW0tJWRUJUg6JU06JVMnKX0ucHEiLAogICAgICAgICkKICAgICAgICBkZi50b19wYXJxdWV0KGRmX3BhdGgsaW5kZXg9RmFsc2UpCgogICAgICAgIHRhc2sgPSBtbHJ1bi5OZXdUYXNrKAogICAgICAgICAgICBuYW1lPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBoYW5kbGVyPSJkcmlmdF9tYWduaXR1ZGUiLAogICAgICAgICAgICBwYXJhbXM9ewogICAgICAgICAgICAgICAgImxhYmVsX2NvbCI6IGNvbnRleHQubGFiZWxfY29sLAogICAgICAgICAgICAgICAgInByZWRpY3Rpb25fY29sIjogY29udGV4dC5wcmVkaWN0aW9uc19jb2wsCiAgICAgICAgICAgICAgICAicmVzdWx0c190c2RiX2NvbnRhaW5lciI6IGNvbnRleHQucmVzdWx0c190c2RiX2NvbnRhaW5lciwKICAgICAgICAgICAgICAgICJyZXN1bHRzX3RzZGJfdGFibGUiOiBjb250ZXh0LnJlc3VsdHNfdHNkYl90YWJsZSwKICAgICAgICAgICAgfSwKICAgICAgICAgICAgaW5wdXRzPXsidCI6IGNvbnRleHQuYmFzZV9kYXRhc2V0LCAidSI6IGRmX3BhdGh9LAogICAgICAgICAgICBhcnRpZmFjdF9wYXRoPW1scnVuLm1sY29uZi5hcnRpZmFjdF9wYXRoLAogICAgICAgICkKCiAgICAgICAgY29udGV4dC52aXJ0dWFsX2RyaWZ0X2ZuLnJ1bih0YXNrLCB3YXRjaD1GYWxzZSkKCiAgICAgICAgY29udGV4dC5iYXRjaCA9IFtdCg==
-  source: ''
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#3605c9b8dcadab89a5a45f7d16dcd2fcfeca8697:/User/test/functions/stream_to_parquet/stream_to_parquet.py
-    origin_filename: /User/test/functions/stream_to_parquet/stream_to_parquet.py
-  default_handler: handler
-  disable_auto_mount: false
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/latest/static/item.html b/functions/development/stream_to_parquet/latest/static/item.html deleted file mode 100644 index 7ffcdaaf..00000000 --- a/functions/development/stream_to_parquet/latest/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- machine-learning
-- data-preparation
-description: Saves a stream to Parquet and can lunch drift detection task on it
-doc: ''
-example: stream_to_parquet.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: stream-to-parquet
-platformVersion: 3.5.0
-spec:
-  customFields:
-    max_replicas: 1
-    min_replicas: 1
-  filename: stream_to_parquet.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: nuclio
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/latest/static/source.html b/functions/development/stream_to_parquet/latest/static/source.html deleted file mode 100644 index acca6741..00000000 --- a/functions/development/stream_to_parquet/latest/static/source.html +++ /dev/null @@ -1,118 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-def record_to_features(record):
-    features = record["request"]["instances"][0]
-    timestamp = record["when"]
-    prediction = record["resp"]
-
-    record = {"timestamp": timestamp, **features, "predictions": prediction}
-
-    return record
-
-
-def init_context(context):
-    setattr(context, "batch", [])
-    setattr(context, "window", int(os.getenv("window", 10)))
-    setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/"))
-    os.makedirs(context.save_to, exist_ok=True)
-
-    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080"
-    artifact_path = os.getenv("artifact_path", None)
-    if artifact_path:
-        mlrun.mlconf.artifact_path = artifact_path
-    if "hub_url" in os.environ:
-        mlrun.mlconf.hub_url = os.environ["hub_url"]
-    virtual_drift_fn = mlrun.import_function("hub://virtual_drift")
-    virtual_drift_fn.apply(mlrun.auto_mount())
-    setattr(context, "virtual_drift_fn", virtual_drift_fn)
-
-    predictions_col = os.getenv("predictions", None)
-    label_col = os.getenv("label_col", None)
-    setattr(context, "base_dataset", os.getenv("base_dataset", ""))
-    setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]")))
-    setattr(context, "predictions_col", predictions_col)
-    setattr(context, "label_col", label_col)
-    setattr(
-        context, "results_tsdb_container", os.getenv("results_tsdb_container", None)
-    )
-    setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
-
-
-def handler(context, event):
-
-    context.logger.info(f"Adding {event.body}")
-    context.batch.append(record_to_features(json.loads(event.body)))
-
-    if len(context.batch) > context.window:
-        context.logger.info(context.batch[:1])
-        context.logger.info(context.indexes)
-        df = pd.DataFrame(context.batch)
-        context.logger.info(f"df example: {df.head(1)}")
-        if context.indexes:
-            df = df.set_index(context.indexes)
-        df_path = os.path.join(
-            context.save_to,
-            f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq",
-        )
-        df.to_parquet(df_path,index=False)
-
-        task = mlrun.NewTask(
-            name="drift_magnitude",
-            handler="drift_magnitude",
-            params={
-                "label_col": context.label_col,
-                "prediction_col": context.predictions_col,
-                "results_tsdb_container": context.results_tsdb_container,
-                "results_tsdb_table": context.results_tsdb_table,
-            },
-            inputs={"t": context.base_dataset, "u": df_path},
-            artifact_path=mlrun.mlconf.artifact_path,
-        )
-
-        context.virtual_drift_fn.run(task, watch=False)
-
-        context.batch = []
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/stream_to_parquet/latest/static/stream_to_parquet.html b/functions/development/stream_to_parquet/latest/static/stream_to_parquet.html deleted file mode 100644 index e9d43572..00000000 --- a/functions/development/stream_to_parquet/latest/static/stream_to_parquet.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - -stream_to_parquet.stream_to_parquet - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for stream_to_parquet.stream_to_parquet

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import json
-import datetime
-import mlrun
-
-
-
[docs]def record_to_features(record): - features = record["request"]["instances"][0] - timestamp = record["when"] - prediction = record["resp"] - - record = {"timestamp": timestamp, **features, "predictions": prediction} - - return record
- - -
[docs]def init_context(context): - setattr(context, "batch", []) - setattr(context, "window", int(os.getenv("window", 10))) - setattr(context, "save_to", os.getenv("save_to", "/bigdata/inference_pq/")) - os.makedirs(context.save_to, exist_ok=True) - - mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or "http://mlrun-api:8080" - artifact_path = os.getenv("artifact_path", None) - if artifact_path: - mlrun.mlconf.artifact_path = artifact_path - if "hub_url" in os.environ: - mlrun.mlconf.hub_url = os.environ["hub_url"] - virtual_drift_fn = mlrun.import_function("hub://virtual_drift") - virtual_drift_fn.apply(mlrun.auto_mount()) - setattr(context, "virtual_drift_fn", virtual_drift_fn) - - predictions_col = os.getenv("predictions", None) - label_col = os.getenv("label_col", None) - setattr(context, "base_dataset", os.getenv("base_dataset", "")) - setattr(context, "indexes", json.loads(os.environ.get("indexes", "[]"))) - setattr(context, "predictions_col", predictions_col) - setattr(context, "label_col", label_col) - setattr( - context, "results_tsdb_container", os.getenv("results_tsdb_container", None) - ) - setattr(context, "results_tsdb_table", os.getenv("results_tsdb_table", None))
- - -
[docs]def handler(context, event): - - context.logger.info(f"Adding {event.body}") - context.batch.append(record_to_features(json.loads(event.body))) - - if len(context.batch) > context.window: - context.logger.info(context.batch[:1]) - context.logger.info(context.indexes) - df = pd.DataFrame(context.batch) - context.logger.info(f"df example: {df.head(1)}") - if context.indexes: - df = df.set_index(context.indexes) - df_path = os.path.join( - context.save_to, - f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq", - ) - df.to_parquet(df_path,index=False) - - task = mlrun.NewTask( - name="drift_magnitude", - handler="drift_magnitude", - params={ - "label_col": context.label_col, - "prediction_col": context.predictions_col, - "results_tsdb_container": context.results_tsdb_container, - "results_tsdb_table": context.results_tsdb_table, - }, - inputs={"t": context.base_dataset, "u": df_path}, - artifact_path=mlrun.mlconf.artifact_path, - ) - - context.virtual_drift_fn.run(task, watch=False) - - context.batch = []
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tags.json b/functions/development/tags.json index 6ebec7be..c42112b8 100644 --- a/functions/development/tags.json +++ b/functions/development/tags.json @@ -1 +1 @@ -{"categories": ["data-analysis", "model-testing", "Huggingface", "data-generation", "model-training", "PyTorch", "deep-learning", "data-validation", "model-serving", "utils", "GenAI", "etl", "data-preparation", "NLP", "feature-store", "machine-learning", "Audio", "monitoring"], "kind": ["job", "serving", "nuclio:serving", "nuclio"]} \ No newline at end of file +{"kind": ["serving", "job", "nuclio:serving"], "categories": ["model-training", "PyTorch", "data-validation", "model-testing", "machine-learning", "data-analysis", "utils", "Huggingface", "deep-learning", "data-generation", "NLP", "data-preparation", "model-serving", "Audio", "etl", "monitoring", "GenAI"]} \ No newline at end of file diff --git a/functions/development/tf1_serving/0.0.1/src/function.yaml b/functions/development/tf1_serving/0.0.1/src/function.yaml deleted file mode 100644 index 4debbcce..00000000 --- a/functions/development/tf1_serving/0.0.1/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: tf1-serving - tag: '' - hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3 - project: default - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf1 image classification server - min_replicas: 1 - max_replicas: 4 - env: - - name: MODEL_CLASS - value: TFModel - - name: ENABLE_EXPLAINER - value: false - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf1-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py - spec: - runtime: python:3.6 - handler: tf1_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - 
functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnN0YW5j
ZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/tf1_serving/0.0.1/src/item.yaml b/functions/development/tf1_serving/0.0.1/src/item.yaml deleted file mode 100644 index 53b54e78..00000000 --- a/functions/development/tf1_serving/0.0.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning 
-description: tf1 image classification server -doc: '' -example: tf1_serving.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: tf1-serving -platformVersion: '' -spec: - filename: tf1_serving.py - handler: handler - image: mlrun/mlrun - kind: nuclio:serving - requirements: [] - env: - MODEL_CLASS: TFModel - ENABLE_EXPLAINER: False -url: '' -version: 0.0.1 diff --git a/functions/development/tf1_serving/0.0.1/src/tf1_serving.ipynb b/functions/development/tf1_serving/0.0.1/src/tf1_serving.ipynb deleted file mode 100644 index 1d42ee60..00000000 --- a/functions/development/tf1_serving/0.0.1/src/tf1_serving.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio 
function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow==1.14 keras`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=TFModel\n", - "\n", - "# tensorflow version 1 requires a different version of python than \n", - "# the default (3.7), so we override the default tag here:\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from keras.models import load_model\n", - "from keras.preprocessing import image\n", - "from keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. \n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.runtimes.MLModelServer):\n", - " def __init__(self, name: str, model_dir: str):\n", - " super().__init__(name, model_dir)\n", - "\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " self.classes = None\n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " pass\n", - " \n", - " def load(self):\n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(open(model_file, 'rb'))\n", - " \n", - " def preprocess(self, body):\n", - " try:\n", - " output = {'instances': []}\n", - " instances = body.get('instances', [])\n", - " for byte_image in instances:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['instances'].append(x)\n", - " \n", - " # Format instances list\n", - " output['instances'] = [np.vstack(output['instances'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('instances', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return predicted_probability\n", - " \n", - " def postprocess(self, predicted_probability):\n", - " if self.classes:\n", - " predicted_classes = np.around(predicted_probability, 1).tolist()[0]\n", - " predicted_probabilities = predicted_probability.tolist()[0]\n", - " return {\n", - " 'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], \n", - " f'{self.classes[\"1\"]}-probability': predicted_probabilities\n", - " }\n", - " else:\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os, requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "# Model Server variables\n", - "model_class = 'TFModel'\n", - "model_name = 'cat_vs_dog_tfv1' # Define for later use in tests\n", - "models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 128,\n", - " 'IMAGE_WIDTH': 128,\n", - " 'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "fn = new_model_server('tf1-serving', \n", - " model_class=model_class,\n", - " models=models)\n", - "fn.set_envs(function_envs)\n", - 
"fn.spec.description = \"tf1 image classification server\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-04-30 20:52:15,886 deploy started\n", - "[nuclio] 2020-04-30 20:53:46,385 (info) Build complete\n", - "[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete\n", - "[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending event: {\"data_url\": \"https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\"}\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# URL event\n", - "event_body = json.dumps({\"data_url\": cat_image_url})\n", 
- "print(f'Sending event: {event_body}')\n", - "\n", - "headers = {'Content-type': 'application/json'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)\n", - "response.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# URL event\n", - "event_body = cat_image\n", - "print(f'Sending image from {cat_image_url}')\n", - "plt.imshow(img)\n", - "\n", - "headers = {'Content-type': 'image/jpeg'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)\n", - "response.content" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf1_serving/0.0.1/src/tf1_serving.py b/functions/development/tf1_serving/0.0.1/src/tf1_serving.py deleted file mode 100644 index 5bced05c..00000000 --- a/functions/development/tf1_serving/0.0.1/src/tf1_serving.py +++ /dev/null @@ -1,73 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from keras.models import load_model -from keras.preprocessing import image -from keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - - def load(self): - 
model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb")) - - def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability - - def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0] diff --git a/functions/development/tf1_serving/0.0.1/static/documentation.html b/functions/development/tf1_serving/0.0.1/static/documentation.html deleted file mode 100644 index c06a4727..00000000 --- a/functions/development/tf1_serving/0.0.1/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -tf1_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf1_serving package

-
-

Submodules

-
-
-

tf1_serving.tf1_serving module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.0.1/static/example.html b/functions/development/tf1_serving/0.0.1/static/example.html deleted file mode 100644 index 54421102..00000000 --- a/functions/development/tf1_serving/0.0.1/static/example.html +++ /dev/null @@ -1,536 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow==1.14 keras

-
-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=TFModel
-
-# tensorflow version 1 requires a different version of python than 
-# the default (3.7), so we override the default tag here:
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        self.classes = None
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            pass
-        
-    def load(self):
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(open(model_file, 'rb'))
-        
-    def preprocess(self, body):
-        try:
-            output = {'instances': []}
-            instances = body.get('instances', [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['instances'].append(x)
-            
-            # Format instances list
-            output['instances'] = [np.vstack(output['instances'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('instances', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability
-        
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], 
-                f'{self.classes["1"]}-probability': predicted_probabilities
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os, requests
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f1232ea52e8>
-
-
-_images/tf1_serving_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
from mlrun import mlconf
-import os
-
-# Model Server variables
-model_class = 'TFModel'
-model_name = 'cat_vs_dog_tfv1' # Define for later use in tests
-models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 128,
-    'IMAGE_WIDTH': 128,
-    'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import new_model_server, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-fn = new_model_server('tf1-serving', 
-                      model_class=model_class,
-                      models=models)
-fn.set_envs(function_envs)
-fn.spec.description = "tf1 image classification server"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f5a1585f908>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
[mlrun] 2020-04-30 20:52:15,886 deploy started
-[nuclio] 2020-04-30 20:53:46,385 (info) Build complete
-[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete
-[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
# URL event
-event_body = json.dumps({"data_url": cat_image_url})
-print(f'Sending event: {event_body}')
-
-headers = {'Content-type': 'application/json'}
-response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending event: {"data_url": "https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg"}
-
-
-
b'[0.0]'
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
# URL event
-event_body = cat_image
-print(f'Sending image from {cat_image_url}')
-plt.imshow(img)
-
-headers = {'Content-type': 'image/jpeg'}
-response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg
-
-
-
b'[0.0]'
-
-
-_images/tf1_serving_example_32_2.png -
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.0.1/static/function.html b/functions/development/tf1_serving/0.0.1/static/function.html deleted file mode 100644 index 1ef2668b..00000000 --- a/functions/development/tf1_serving/0.0.1/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: tf1-serving
-  tag: ''
-  hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3
-  project: default
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf1 image classification server
-  min_replicas: 1
-  max_replicas: 4
-  env:
-  - name: MODEL_CLASS
-    value: TFModel
-  - name: ENABLE_EXPLAINER
-    value: false
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf1-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py
-    spec:
-      runtime: python:3.6
-      handler: tf1_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJ
pbnN0YW5jZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.0.1/static/item.html b/functions/development/tf1_serving/0.0.1/static/item.html deleted file mode 100644 index 2bad563f..00000000 --- a/functions/development/tf1_serving/0.0.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf1 image classification server
-doc: ''
-example: tf1_serving.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: tf1-serving
-platformVersion: ''
-spec:
-  filename: tf1_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio:serving
-  requirements: []
-  env:
-    MODEL_CLASS: TFModel
-    ENABLE_EXPLAINER: False
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.0.1/static/source.html b/functions/development/tf1_serving/0.0.1/static/source.html deleted file mode 100644 index ae10b34f..00000000 --- a/functions/development/tf1_serving/0.0.1/static/source.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-        self.classes = None
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            pass
-
-    def load(self):
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(open(model_file, "rb"))
-
-    def preprocess(self, body):
-        try:
-            output = {"instances": []}
-            instances = body.get("instances", [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["instances"].append(x)
-
-            output["instances"] = [np.vstack(output["instances"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("instances", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability
-
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                "prediction": [
-                    self.classes[str(int(cls))] for cls in predicted_classes
-                ],
-                f'{self.classes["1"]}-probability': predicted_probabilities,
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.8.0/src/function.yaml b/functions/development/tf1_serving/0.8.0/src/function.yaml deleted file mode 100644 index 4debbcce..00000000 --- a/functions/development/tf1_serving/0.8.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: tf1-serving - tag: '' - hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3 - project: default - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf1 image classification server - min_replicas: 1 - max_replicas: 4 - env: - - name: MODEL_CLASS - value: TFModel - - name: ENABLE_EXPLAINER - value: false - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf1-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py - spec: - runtime: python:3.6 - handler: tf1_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnN0YW5jZXMiXSA9IFtucC52c3Rh
Y2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/tf1_serving/0.8.0/src/item.yaml b/functions/development/tf1_serving/0.8.0/src/item.yaml deleted file mode 100644 index 8c148abc..00000000 --- a/functions/development/tf1_serving/0.8.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf1 image 
classification server -doc: '' -example: tf1_serving.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: tf1-serving -platformVersion: 3.2.0 -spec: - env: - ENABLE_EXPLAINER: false - MODEL_CLASS: TFModel - filename: tf1_serving.py - handler: handler - image: mlrun/mlrun - kind: nuclio:serving - requirements: [] -url: '' -version: 0.8.0 diff --git a/functions/development/tf1_serving/0.8.0/src/tf1_serving.ipynb b/functions/development/tf1_serving/0.8.0/src/tf1_serving.ipynb deleted file mode 100644 index 1d42ee60..00000000 --- a/functions/development/tf1_serving/0.8.0/src/tf1_serving.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first 
import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow==1.14 keras`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=TFModel\n", - "\n", - "# tensorflow version 1 requires a different version of python than \n", - "# the default (3.7), so we override the default tag here:\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from keras.models import load_model\n", - "from keras.preprocessing import image\n", - "from keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. \n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.runtimes.MLModelServer):\n", - " def __init__(self, name: str, model_dir: str):\n", - " super().__init__(name, model_dir)\n", - "\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " self.classes = None\n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " pass\n", - " \n", - " def load(self):\n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(open(model_file, 'rb'))\n", - " \n", - " def preprocess(self, body):\n", - " try:\n", - " output = {'instances': []}\n", - " instances = body.get('instances', [])\n", - " for byte_image in instances:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['instances'].append(x)\n", - " \n", - " # Format instances list\n", - " output['instances'] = [np.vstack(output['instances'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('instances', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return predicted_probability\n", - " \n", - " def postprocess(self, predicted_probability):\n", - " if self.classes:\n", - " predicted_classes = np.around(predicted_probability, 1).tolist()[0]\n", - " predicted_probabilities = predicted_probability.tolist()[0]\n", - " return {\n", - " 'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], \n", - " f'{self.classes[\"1\"]}-probability': predicted_probabilities\n", - " }\n", - " else:\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os, requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "# Model Server variables\n", - "model_class = 'TFModel'\n", - "model_name = 'cat_vs_dog_tfv1' # Define for later use in tests\n", - "models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 128,\n", - " 'IMAGE_WIDTH': 128,\n", - " 'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "fn = new_model_server('tf1-serving', \n", - " model_class=model_class,\n", - " models=models)\n", - "fn.set_envs(function_envs)\n", - 
"fn.spec.description = \"tf1 image classification server\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-04-30 20:52:15,886 deploy started\n", - "[nuclio] 2020-04-30 20:53:46,385 (info) Build complete\n", - "[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete\n", - "[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending event: {\"data_url\": \"https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\"}\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# URL event\n", - "event_body = json.dumps({\"data_url\": cat_image_url})\n", 
- "print(f'Sending event: {event_body}')\n", - "\n", - "headers = {'Content-type': 'application/json'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)\n", - "response.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# URL event\n", - "event_body = cat_image\n", - "print(f'Sending image from {cat_image_url}')\n", - "plt.imshow(img)\n", - "\n", - "headers = {'Content-type': 'image/jpeg'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)\n", - "response.content" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf1_serving/0.8.0/src/tf1_serving.py b/functions/development/tf1_serving/0.8.0/src/tf1_serving.py deleted file mode 100644 index 5bced05c..00000000 --- a/functions/development/tf1_serving/0.8.0/src/tf1_serving.py +++ /dev/null @@ -1,73 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from keras.models import load_model -from keras.preprocessing import image -from keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - - def load(self): - 
model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb")) - - def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability - - def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0] diff --git a/functions/development/tf1_serving/0.8.0/static/documentation.html b/functions/development/tf1_serving/0.8.0/static/documentation.html deleted file mode 100644 index c06a4727..00000000 --- a/functions/development/tf1_serving/0.8.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -tf1_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf1_serving package

-
-

Submodules

-
-
-

tf1_serving.tf1_serving module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.8.0/static/example.html b/functions/development/tf1_serving/0.8.0/static/example.html deleted file mode 100644 index fa08ec43..00000000 --- a/functions/development/tf1_serving/0.8.0/static/example.html +++ /dev/null @@ -1,536 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow==1.14 keras

-
-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=TFModel
-
-# tensorflow version 1 requires a different version of python than 
-# the default (3.7), so we override the default tag here:
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        self.classes = None
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            pass
-        
-    def load(self):
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(open(model_file, 'rb'))
-        
-    def preprocess(self, body):
-        try:
-            output = {'instances': []}
-            instances = body.get('instances', [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['instances'].append(x)
-            
-            # Format instances list
-            output['instances'] = [np.vstack(output['instances'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('instances', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability
-        
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], 
-                f'{self.classes["1"]}-probability': predicted_probabilities
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os, requests
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f1232ea52e8>
-
-
-_images/tf1_serving_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
from mlrun import mlconf
-import os
-
-# Model Server variables
-model_class = 'TFModel'
-model_name = 'cat_vs_dog_tfv1' # Define for later use in tests
-models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 128,
-    'IMAGE_WIDTH': 128,
-    'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import new_model_server, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-fn = new_model_server('tf1-serving', 
-                      model_class=model_class,
-                      models=models)
-fn.set_envs(function_envs)
-fn.spec.description = "tf1 image classification server"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f5a1585f908>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
[mlrun] 2020-04-30 20:52:15,886 deploy started
-[nuclio] 2020-04-30 20:53:46,385 (info) Build complete
-[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete
-[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
# URL event
-event_body = json.dumps({"data_url": cat_image_url})
-print(f'Sending event: {event_body}')
-
-headers = {'Content-type': 'application/json'}
-response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending event: {"data_url": "https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg"}
-
-
-
b'[0.0]'
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
# URL event
-event_body = cat_image
-print(f'Sending image from {cat_image_url}')
-plt.imshow(img)
-
-headers = {'Content-type': 'image/jpeg'}
-response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg
-
-
-
b'[0.0]'
-
-
-_images/tf1_serving_example_32_2.png -
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.8.0/static/function.html b/functions/development/tf1_serving/0.8.0/static/function.html deleted file mode 100644 index 1ef2668b..00000000 --- a/functions/development/tf1_serving/0.8.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: tf1-serving
-  tag: ''
-  hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3
-  project: default
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf1 image classification server
-  min_replicas: 1
-  max_replicas: 4
-  env:
-  - name: MODEL_CLASS
-    value: TFModel
-  - name: ENABLE_EXPLAINER
-    value: false
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf1-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py
-    spec:
-      runtime: python:3.6
-      handler: tf1_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJ
pbnN0YW5jZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.8.0/static/item.html b/functions/development/tf1_serving/0.8.0/static/item.html deleted file mode 100644 index d845347c..00000000 --- a/functions/development/tf1_serving/0.8.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf1 image classification server
-doc: ''
-example: tf1_serving.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: tf1-serving
-platformVersion: 3.2.0
-spec:
-  env:
-    ENABLE_EXPLAINER: false
-    MODEL_CLASS: TFModel
-  filename: tf1_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio:serving
-  requirements: []
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.8.0/static/source.html b/functions/development/tf1_serving/0.8.0/static/source.html deleted file mode 100644 index ae10b34f..00000000 --- a/functions/development/tf1_serving/0.8.0/static/source.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-        self.classes = None
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            pass
-
-    def load(self):
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(open(model_file, "rb"))
-
-    def preprocess(self, body):
-        try:
-            output = {"instances": []}
-            instances = body.get("instances", [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["instances"].append(x)
-
-            output["instances"] = [np.vstack(output["instances"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("instances", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability
-
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                "prediction": [
-                    self.classes[str(int(cls))] for cls in predicted_classes
-                ],
-                f'{self.classes["1"]}-probability': predicted_probabilities,
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.0/src/function.yaml b/functions/development/tf1_serving/0.9.0/src/function.yaml deleted file mode 100644 index e6a57c4b..00000000 --- a/functions/development/tf1_serving/0.9.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: tf1-serving - tag: '' - hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf1 image classification server - min_replicas: 1 - max_replicas: 4 - env: - - name: MODEL_CLASS - value: TFModel - - name: ENABLE_EXPLAINER - value: false - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf1-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py - spec: - runtime: python:3.6 - handler: tf1_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnN0YW5jZXMiXSA9IFtucC52c3Rh
Y2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/tf1_serving/0.9.0/src/item.yaml b/functions/development/tf1_serving/0.9.0/src/item.yaml deleted file mode 100644 index 44afbe9d..00000000 --- a/functions/development/tf1_serving/0.9.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf1 image 
classification server -doc: '' -example: tf1_serving.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: tf1-serving -platformVersion: 3.2.0 -spec: - env: - ENABLE_EXPLAINER: false - MODEL_CLASS: TFModel - filename: tf1_serving.py - handler: handler - image: mlrun/mlrun - kind: nuclio:serving - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/tf1_serving/0.9.0/src/tf1_serving.ipynb b/functions/development/tf1_serving/0.9.0/src/tf1_serving.ipynb deleted file mode 100644 index 1d42ee60..00000000 --- a/functions/development/tf1_serving/0.9.0/src/tf1_serving.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first 
import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow==1.14 keras`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=TFModel\n", - "\n", - "# tensorflow version 1 requires a different version of python than \n", - "# the default (3.7), so we override the default tag here:\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from keras.models import load_model\n", - "from keras.preprocessing import image\n", - "from keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. \n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.runtimes.MLModelServer):\n", - " def __init__(self, name: str, model_dir: str):\n", - " super().__init__(name, model_dir)\n", - "\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " self.classes = None\n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " pass\n", - " \n", - " def load(self):\n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(open(model_file, 'rb'))\n", - " \n", - " def preprocess(self, body):\n", - " try:\n", - " output = {'instances': []}\n", - " instances = body.get('instances', [])\n", - " for byte_image in instances:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['instances'].append(x)\n", - " \n", - " # Format instances list\n", - " output['instances'] = [np.vstack(output['instances'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('instances', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return predicted_probability\n", - " \n", - " def postprocess(self, predicted_probability):\n", - " if self.classes:\n", - " predicted_classes = np.around(predicted_probability, 1).tolist()[0]\n", - " predicted_probabilities = predicted_probability.tolist()[0]\n", - " return {\n", - " 'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], \n", - " f'{self.classes[\"1\"]}-probability': predicted_probabilities\n", - " }\n", - " else:\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os, requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "# Model Server variables\n", - "model_class = 'TFModel'\n", - "model_name = 'cat_vs_dog_tfv1' # Define for later use in tests\n", - "models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 128,\n", - " 'IMAGE_WIDTH': 128,\n", - " 'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "fn = new_model_server('tf1-serving', \n", - " model_class=model_class,\n", - " models=models)\n", - "fn.set_envs(function_envs)\n", - 
"fn.spec.description = \"tf1 image classification server\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-04-30 20:52:15,886 deploy started\n", - "[nuclio] 2020-04-30 20:53:46,385 (info) Build complete\n", - "[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete\n", - "[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending event: {\"data_url\": \"https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\"}\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# URL event\n", - "event_body = json.dumps({\"data_url\": cat_image_url})\n", 
- "print(f'Sending event: {event_body}')\n", - "\n", - "headers = {'Content-type': 'application/json'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)\n", - "response.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# URL event\n", - "event_body = cat_image\n", - "print(f'Sending image from {cat_image_url}')\n", - "plt.imshow(img)\n", - "\n", - "headers = {'Content-type': 'image/jpeg'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)\n", - "response.content" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf1_serving/0.9.0/src/tf1_serving.py b/functions/development/tf1_serving/0.9.0/src/tf1_serving.py deleted file mode 100644 index 5bced05c..00000000 --- a/functions/development/tf1_serving/0.9.0/src/tf1_serving.py +++ /dev/null @@ -1,73 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from keras.models import load_model -from keras.preprocessing import image -from keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - - def load(self): - 
model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb")) - - def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability - - def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0] diff --git a/functions/development/tf1_serving/0.9.0/static/documentation.html b/functions/development/tf1_serving/0.9.0/static/documentation.html deleted file mode 100644 index c06a4727..00000000 --- a/functions/development/tf1_serving/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -tf1_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf1_serving package

-
-

Submodules

-
-
-

tf1_serving.tf1_serving module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.0/static/example.html b/functions/development/tf1_serving/0.9.0/static/example.html deleted file mode 100644 index fa08ec43..00000000 --- a/functions/development/tf1_serving/0.9.0/static/example.html +++ /dev/null @@ -1,536 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow==1.14 keras

-
-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=TFModel
-
-# tensorflow version 1 requires a different version of python than 
-# the default (3.7), so we override the default tag here:
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        self.classes = None
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            pass
-        
-    def load(self):
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(open(model_file, 'rb'))
-        
-    def preprocess(self, body):
-        try:
-            output = {'instances': []}
-            instances = body.get('instances', [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['instances'].append(x)
-            
-            # Format instances list
-            output['instances'] = [np.vstack(output['instances'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('instances', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability
-        
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], 
-                f'{self.classes["1"]}-probability': predicted_probabilities
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os, requests
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f1232ea52e8>
-
-
-_images/tf1_serving_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
from mlrun import mlconf
-import os
-
-# Model Server variables
-model_class = 'TFModel'
-model_name = 'cat_vs_dog_tfv1' # Define for later use in tests
-models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 128,
-    'IMAGE_WIDTH': 128,
-    'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import new_model_server, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-fn = new_model_server('tf1-serving', 
-                      model_class=model_class,
-                      models=models)
-fn.set_envs(function_envs)
-fn.spec.description = "tf1 image classification server"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f5a1585f908>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
[mlrun] 2020-04-30 20:52:15,886 deploy started
-[nuclio] 2020-04-30 20:53:46,385 (info) Build complete
-[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete
-[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
# URL event
-event_body = json.dumps({"data_url": cat_image_url})
-print(f'Sending event: {event_body}')
-
-headers = {'Content-type': 'application/json'}
-response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending event: {"data_url": "https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg"}
-
-
-
b'[0.0]'
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
# URL event
-event_body = cat_image
-print(f'Sending image from {cat_image_url}')
-plt.imshow(img)
-
-headers = {'Content-type': 'image/jpeg'}
-response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg
-
-
-
b'[0.0]'
-
-
-_images/tf1_serving_example_32_2.png -
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.0/static/function.html b/functions/development/tf1_serving/0.9.0/static/function.html deleted file mode 100644 index 9f1ba0c4..00000000 --- a/functions/development/tf1_serving/0.9.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: tf1-serving
-  tag: ''
-  hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf1 image classification server
-  min_replicas: 1
-  max_replicas: 4
-  env:
-  - name: MODEL_CLASS
-    value: TFModel
-  - name: ENABLE_EXPLAINER
-    value: false
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf1-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py
-    spec:
-      runtime: python:3.6
-      handler: tf1_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJ
pbnN0YW5jZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.0/static/item.html b/functions/development/tf1_serving/0.9.0/static/item.html deleted file mode 100644 index 4e5565f5..00000000 --- a/functions/development/tf1_serving/0.9.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf1 image classification server
-doc: ''
-example: tf1_serving.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: tf1-serving
-platformVersion: 3.2.0
-spec:
-  env:
-    ENABLE_EXPLAINER: false
-    MODEL_CLASS: TFModel
-  filename: tf1_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio:serving
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.0/static/source.html b/functions/development/tf1_serving/0.9.0/static/source.html deleted file mode 100644 index ae10b34f..00000000 --- a/functions/development/tf1_serving/0.9.0/static/source.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-        self.classes = None
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            pass
-
-    def load(self):
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(open(model_file, "rb"))
-
-    def preprocess(self, body):
-        try:
-            output = {"instances": []}
-            instances = body.get("instances", [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["instances"].append(x)
-
-            output["instances"] = [np.vstack(output["instances"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("instances", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability
-
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                "prediction": [
-                    self.classes[str(int(cls))] for cls in predicted_classes
-                ],
-                f'{self.classes["1"]}-probability': predicted_probabilities,
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.1/src/function.yaml b/functions/development/tf1_serving/0.9.1/src/function.yaml deleted file mode 100644 index e6a57c4b..00000000 --- a/functions/development/tf1_serving/0.9.1/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: tf1-serving - tag: '' - hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf1 image classification server - min_replicas: 1 - max_replicas: 4 - env: - - name: MODEL_CLASS - value: TFModel - - name: ENABLE_EXPLAINER - value: false - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf1-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py - spec: - runtime: python:3.6 - handler: tf1_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnN0YW5jZXMiXSA9IFtucC52c3Rh
Y2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/tf1_serving/0.9.1/src/item.yaml b/functions/development/tf1_serving/0.9.1/src/item.yaml deleted file mode 100644 index 654a895a..00000000 --- a/functions/development/tf1_serving/0.9.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf1 image 
classification server -doc: '' -example: tf1_serving.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: tf1-serving -platformVersion: 3.2.0 -spec: - env: - ENABLE_EXPLAINER: false - MODEL_CLASS: TFModel - filename: tf1_serving.py - handler: handler - image: mlrun/mlrun - kind: nuclio:serving - requirements: [] -url: '' -version: 0.9.1 diff --git a/functions/development/tf1_serving/0.9.1/src/requirements.txt b/functions/development/tf1_serving/0.9.1/src/requirements.txt deleted file mode 100644 index 8d3d1955..00000000 --- a/functions/development/tf1_serving/0.9.1/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pillow -tensorflow \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.1/src/tf1_serving.ipynb b/functions/development/tf1_serving/0.9.1/src/tf1_serving.ipynb deleted file mode 100644 index 1d42ee60..00000000 --- a/functions/development/tf1_serving/0.9.1/src/tf1_serving.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on 
the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow==1.14 keras`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=TFModel\n", - "\n", - "# tensorflow version 1 requires a different version of python than \n", - "# the default (3.7), so we override the default tag here:\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. 
\n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from keras.models import load_model\n", - "from keras.preprocessing import image\n", - "from keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.runtimes.MLModelServer):\n", - " def __init__(self, name: str, model_dir: str):\n", - " super().__init__(name, model_dir)\n", - "\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " self.classes = None\n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " pass\n", - " \n", - " def load(self):\n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(open(model_file, 'rb'))\n", - " \n", - " def preprocess(self, body):\n", - " try:\n", - " output = {'instances': []}\n", - " instances = body.get('instances', [])\n", - " for byte_image in instances:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['instances'].append(x)\n", - " \n", - " # Format instances list\n", - " output['instances'] = [np.vstack(output['instances'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('instances', [])\n", - "\n", - " # Predict\n", - " 
predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return predicted_probability\n", - " \n", - " def postprocess(self, predicted_probability):\n", - " if self.classes:\n", - " predicted_classes = np.around(predicted_probability, 1).tolist()[0]\n", - " predicted_probabilities = predicted_probability.tolist()[0]\n", - " return {\n", - " 'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], \n", - " f'{self.classes[\"1\"]}-probability': predicted_probabilities\n", - " }\n", - " else:\n", - " return predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os, requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "# Model Server variables\n", - "model_class = 'TFModel'\n", - "model_name = 'cat_vs_dog_tfv1' # Define for later use in tests\n", - "models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 128,\n", - " 'IMAGE_WIDTH': 128,\n", - " 'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "fn = new_model_server('tf1-serving', \n", - " model_class=model_class,\n", - " models=models)\n", - "fn.set_envs(function_envs)\n", - 
"fn.spec.description = \"tf1 image classification server\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-04-30 20:52:15,886 deploy started\n", - "[nuclio] 2020-04-30 20:53:46,385 (info) Build complete\n", - "[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete\n", - "[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending event: {\"data_url\": \"https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\"}\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# URL event\n", - "event_body = json.dumps({\"data_url\": cat_image_url})\n", 
- "print(f'Sending event: {event_body}')\n", - "\n", - "headers = {'Content-type': 'application/json'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)\n", - "response.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# URL event\n", - "event_body = cat_image\n", - "print(f'Sending image from {cat_image_url}')\n", - "plt.imshow(img)\n", - "\n", - "headers = {'Content-type': 'image/jpeg'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)\n", - "response.content" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf1_serving/0.9.1/src/tf1_serving.py b/functions/development/tf1_serving/0.9.1/src/tf1_serving.py deleted file mode 100644 index 5bced05c..00000000 --- a/functions/development/tf1_serving/0.9.1/src/tf1_serving.py +++ /dev/null @@ -1,73 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from keras.models import load_model -from keras.preprocessing import image -from keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - - def load(self): - 
model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb")) - - def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability - - def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0] diff --git a/functions/development/tf1_serving/0.9.1/static/documentation.html b/functions/development/tf1_serving/0.9.1/static/documentation.html deleted file mode 100644 index a5646c83..00000000 --- a/functions/development/tf1_serving/0.9.1/static/documentation.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - - -tf1_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf1_serving package

-
-

Submodules

-
-
-

tf1_serving.tf1_serving module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.1/static/example.html b/functions/development/tf1_serving/0.9.1/static/example.html deleted file mode 100644 index 62075201..00000000 --- a/functions/development/tf1_serving/0.9.1/static/example.html +++ /dev/null @@ -1,539 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow==1.14 keras

-
-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=TFModel
-
-# tensorflow version 1 requires a different version of python than 
-# the default (3.7), so we override the default tag here:
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        self.classes = None
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            pass
-        
-    def load(self):
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(open(model_file, 'rb'))
-        
-    def preprocess(self, body):
-        try:
-            output = {'instances': []}
-            instances = body.get('instances', [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['instances'].append(x)
-            
-            # Format instances list
-            output['instances'] = [np.vstack(output['instances'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('instances', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability
-        
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], 
-                f'{self.classes["1"]}-probability': predicted_probabilities
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os, requests
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f1232ea52e8>
-
-
-_images/tf1_serving_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
from mlrun import mlconf
-import os
-
-# Model Server variables
-model_class = 'TFModel'
-model_name = 'cat_vs_dog_tfv1' # Define for later use in tests
-models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 128,
-    'IMAGE_WIDTH': 128,
-    'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import new_model_server, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-fn = new_model_server('tf1-serving', 
-                      model_class=model_class,
-                      models=models)
-fn.set_envs(function_envs)
-fn.spec.description = "tf1 image classification server"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f5a1585f908>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
[mlrun] 2020-04-30 20:52:15,886 deploy started
-[nuclio] 2020-04-30 20:53:46,385 (info) Build complete
-[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete
-[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
# URL event
-event_body = json.dumps({"data_url": cat_image_url})
-print(f'Sending event: {event_body}')
-
-headers = {'Content-type': 'application/json'}
-response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending event: {"data_url": "https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg"}
-
-
-
b'[0.0]'
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
# URL event
-event_body = cat_image
-print(f'Sending image from {cat_image_url}')
-plt.imshow(img)
-
-headers = {'Content-type': 'image/jpeg'}
-response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg
-
-
-
b'[0.0]'
-
-
-_images/tf1_serving_example_32_2.png -
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.1/static/function.html b/functions/development/tf1_serving/0.9.1/static/function.html deleted file mode 100644 index 9f1ba0c4..00000000 --- a/functions/development/tf1_serving/0.9.1/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: tf1-serving
-  tag: ''
-  hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf1 image classification server
-  min_replicas: 1
-  max_replicas: 4
-  env:
-  - name: MODEL_CLASS
-    value: TFModel
-  - name: ENABLE_EXPLAINER
-    value: false
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf1-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py
-    spec:
-      runtime: python:3.6
-      handler: tf1_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJ
pbnN0YW5jZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.1/static/item.html b/functions/development/tf1_serving/0.9.1/static/item.html deleted file mode 100644 index a25d3df8..00000000 --- a/functions/development/tf1_serving/0.9.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf1 image classification server
-doc: ''
-example: tf1_serving.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: tf1-serving
-platformVersion: 3.2.0
-spec:
-  env:
-    ENABLE_EXPLAINER: false
-    MODEL_CLASS: TFModel
-  filename: tf1_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio:serving
-  requirements: []
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/0.9.1/static/source.html b/functions/development/tf1_serving/0.9.1/static/source.html deleted file mode 100644 index ae10b34f..00000000 --- a/functions/development/tf1_serving/0.9.1/static/source.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-        self.classes = None
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            pass
-
-    def load(self):
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(open(model_file, "rb"))
-
-    def preprocess(self, body):
-        try:
-            output = {"instances": []}
-            instances = body.get("instances", [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["instances"].append(x)
-
-            output["instances"] = [np.vstack(output["instances"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("instances", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability
-
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                "prediction": [
-                    self.classes[str(int(cls))] for cls in predicted_classes
-                ],
-                f'{self.classes["1"]}-probability': predicted_probabilities,
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/src/function.yaml b/functions/development/tf1_serving/1.1.0/src/function.yaml deleted file mode 100644 index e6a57c4b..00000000 --- a/functions/development/tf1_serving/1.1.0/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: tf1-serving - tag: '' - hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf1 image classification server - min_replicas: 1 - max_replicas: 4 - env: - - name: MODEL_CLASS - value: TFModel - - name: ENABLE_EXPLAINER - value: false - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf1-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py - spec: - runtime: python:3.6 - handler: tf1_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnN0YW5jZXMiXSA9IFtucC52c3Rh
Y2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/tf1_serving/1.1.0/src/item.yaml b/functions/development/tf1_serving/1.1.0/src/item.yaml deleted file mode 100644 index 6a5648ab..00000000 --- a/functions/development/tf1_serving/1.1.0/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf1 image 
classification server -doc: '' -example: tf1_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: tf1-serving -platformVersion: 3.5.0 -spec: - env: - ENABLE_EXPLAINER: false - MODEL_CLASS: TFModel - filename: tf1_serving.py - handler: handler - image: mlrun/mlrun - kind: nuclio:serving - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/tf1_serving/1.1.0/src/requirements.txt b/functions/development/tf1_serving/1.1.0/src/requirements.txt deleted file mode 100644 index 8d3d1955..00000000 --- a/functions/development/tf1_serving/1.1.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pillow -tensorflow \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/src/tf1_serving.ipynb b/functions/development/tf1_serving/1.1.0/src/tf1_serving.ipynb deleted file mode 100644 index 1d42ee60..00000000 --- a/functions/development/tf1_serving/1.1.0/src/tf1_serving.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the 
deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow==1.14 keras`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=TFModel\n", - "\n", - "# tensorflow version 1 requires a different version of python than \n", - "# the default (3.7), so we override the default tag here:\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio 
cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from keras.models import load_model\n", - "from keras.preprocessing import image\n", - "from keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.runtimes.MLModelServer):\n", - " def __init__(self, name: str, model_dir: str):\n", - " super().__init__(name, model_dir)\n", - "\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " self.classes = None\n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " pass\n", - " \n", - " def load(self):\n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(open(model_file, 'rb'))\n", - " \n", - " def preprocess(self, body):\n", - " try:\n", - " output = {'instances': []}\n", - " instances = body.get('instances', [])\n", - " for byte_image in instances:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['instances'].append(x)\n", - " \n", - " # Format instances list\n", - " output['instances'] = [np.vstack(output['instances'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('instances', [])\n", - "\n", - " # Predict\n", - " 
predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return predicted_probability\n", - " \n", - " def postprocess(self, predicted_probability):\n", - " if self.classes:\n", - " predicted_classes = np.around(predicted_probability, 1).tolist()[0]\n", - " predicted_probabilities = predicted_probability.tolist()[0]\n", - " return {\n", - " 'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], \n", - " f'{self.classes[\"1\"]}-probability': predicted_probabilities\n", - " }\n", - " else:\n", - " return predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os, requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "# Model Server variables\n", - "model_class = 'TFModel'\n", - "model_name = 'cat_vs_dog_tfv1' # Define for later use in tests\n", - "models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 128,\n", - " 'IMAGE_WIDTH': 128,\n", - " 'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "fn = new_model_server('tf1-serving', \n", - " model_class=model_class,\n", - " models=models)\n", - "fn.set_envs(function_envs)\n", - 
"fn.spec.description = \"tf1 image classification server\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-04-30 20:52:15,886 deploy started\n", - "[nuclio] 2020-04-30 20:53:46,385 (info) Build complete\n", - "[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete\n", - "[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending event: {\"data_url\": \"https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\"}\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# URL event\n", - "event_body = json.dumps({\"data_url\": cat_image_url})\n", 
- "print(f'Sending event: {event_body}')\n", - "\n", - "headers = {'Content-type': 'application/json'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)\n", - "response.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# URL event\n", - "event_body = cat_image\n", - "print(f'Sending image from {cat_image_url}')\n", - "plt.imshow(img)\n", - "\n", - "headers = {'Content-type': 'image/jpeg'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)\n", - "response.content" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf1_serving/1.1.0/src/tf1_serving.py b/functions/development/tf1_serving/1.1.0/src/tf1_serving.py deleted file mode 100644 index d9816c68..00000000 --- a/functions/development/tf1_serving/1.1.0/src/tf1_serving.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from keras.models import load_model -from keras.preprocessing import image -from keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - - def load(self): - model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb")) - - def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability - - def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0] diff --git 
a/functions/development/tf1_serving/1.1.0/static/documentation.html b/functions/development/tf1_serving/1.1.0/static/documentation.html deleted file mode 100644 index e71da255..00000000 --- a/functions/development/tf1_serving/1.1.0/static/documentation.html +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - -tf1_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

tf1_serving package

- -
- -
-
-
-
-
-

tf1_serving package#

-
-

Submodules#

-
-
-

tf1_serving.tf1_serving module#

-
-
-class tf1_serving.tf1_serving.TFModel(name: str, model_dir: str)[source]#
-

Bases: mlrun.serving.v1_serving.MLModelServer

-
-
-load()[source]#
-
-
-
-postprocess(predicted_probability)[source]#
-
-
-
-predict(data)[source]#
-
-
-
-preprocess(body)[source]#
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/static/example.html b/functions/development/tf1_serving/1.1.0/static/example.html deleted file mode 100644 index adfbff17..00000000 --- a/functions/development/tf1_serving/1.1.0/static/example.html +++ /dev/null @@ -1,691 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Image Classification Model - Serving Function#

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

-
    -
  • Define Nuclio function

    -
      -
    • Install dependencies and set config

    • -
    • Model serving class

    • -
    -
  • -
  • Deploy the serving function to the cluster

  • -
  • Define test parameters

  • -
  • Test the deployed function on the cluster

  • -
-
-

Define Nuclio Function#

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-

Install dependencies and set config#

-
-

Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow==1.14 keras

-
-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=TFModel
-
-# tensorflow version 1 requires a different version of python than 
-# the default (3.7), so we override the default tag here:
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code#

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
-

Model Serving Class#

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        self.classes = None
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            pass
-        
-    def load(self):
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(open(model_file, 'rb'))
-        
-    def preprocess(self, body):
-        try:
-            output = {'instances': []}
-            instances = body.get('instances', [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['instances'].append(x)
-            
-            # Format instances list
-            output['instances'] = [np.vstack(output['instances'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('instances', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability
-        
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], 
-                f'{self.classes["1"]}-probability': predicted_probabilities
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally#

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os, requests
-
-
-
-
-
-

Define test parameters#

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f1232ea52e8>
-
-
-_images/42de06ae5aa6f46639fa3ef8175a9de784413555ecd0f5444a2c569286d93af1.png -
-
-
-
-

Define Function specifications#

-
-
-
from mlrun import mlconf
-import os
-
-# Model Server variables
-model_class = 'TFModel'
-model_name = 'cat_vs_dog_tfv1' # Define for later use in tests
-models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 128,
-    'IMAGE_WIDTH': 128,
-    'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster#

-
-
-
from mlrun import new_model_server, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-fn = new_model_server('tf1-serving', 
-                      model_class=model_class,
-                      models=models)
-fn.set_envs(function_envs)
-fn.spec.description = "tf1 image classification server"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f5a1585f908>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
[mlrun] 2020-04-30 20:52:15,886 deploy started
-[nuclio] 2020-04-30 20:53:46,385 (info) Build complete
-[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete
-[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961
-
-
-
-
-
-
-

Test the deployed function on the cluster#

-
-

Test the deployed function (with URL)#

-
-
-
# URL event
-event_body = json.dumps({"data_url": cat_image_url})
-print(f'Sending event: {event_body}')
-
-headers = {'Content-type': 'application/json'}
-response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending event: {"data_url": "https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg"}
-
-
-
b'[0.0]'
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)#

-
-
-
# URL event
-event_body = cat_image
-print(f'Sending image from {cat_image_url}')
-plt.imshow(img)
-
-headers = {'Content-type': 'image/jpeg'}
-response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg
-
-
-
b'[0.0]'
-
-
-_images/42de06ae5aa6f46639fa3ef8175a9de784413555ecd0f5444a2c569286d93af1.png -
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/static/function.html b/functions/development/tf1_serving/1.1.0/static/function.html deleted file mode 100644 index 9f1ba0c4..00000000 --- a/functions/development/tf1_serving/1.1.0/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: tf1-serving
-  tag: ''
-  hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf1 image classification server
-  min_replicas: 1
-  max_replicas: 4
-  env:
-  - name: MODEL_CLASS
-    value: TFModel
-  - name: ENABLE_EXPLAINER
-    value: false
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf1-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py
-    spec:
-      runtime: python:3.6
-      handler: tf1_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJ
pbnN0YW5jZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/static/item.html b/functions/development/tf1_serving/1.1.0/static/item.html deleted file mode 100644 index 9e2bddfe..00000000 --- a/functions/development/tf1_serving/1.1.0/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf1 image classification server
-doc: ''
-example: tf1_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: tf1-serving
-platformVersion: 3.5.0
-spec:
-  env:
-    ENABLE_EXPLAINER: false
-    MODEL_CLASS: TFModel
-  filename: tf1_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio:serving
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/static/source.html b/functions/development/tf1_serving/1.1.0/static/source.html deleted file mode 100644 index 0ef12f0b..00000000 --- a/functions/development/tf1_serving/1.1.0/static/source.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-        self.classes = None
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            pass
-
-    def load(self):
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(open(model_file, "rb"))
-
-    def preprocess(self, body):
-        try:
-            output = {"instances": []}
-            instances = body.get("instances", [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["instances"].append(x)
-
-            output["instances"] = [np.vstack(output["instances"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("instances", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability
-
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                "prediction": [
-                    self.classes[str(int(cls))] for cls in predicted_classes
-                ],
-                f'{self.classes["1"]}-probability': predicted_probabilities,
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/1.1.0/static/tf1_serving.html b/functions/development/tf1_serving/1.1.0/static/tf1_serving.html deleted file mode 100644 index f7654fa0..00000000 --- a/functions/development/tf1_serving/1.1.0/static/tf1_serving.html +++ /dev/null @@ -1,227 +0,0 @@ - - - - - - - -tf1_serving.tf1_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for tf1_serving.tf1_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
[docs]class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - -
[docs] def load(self): - model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb"))
- -
[docs] def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}")
- -
[docs] def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability
- -
[docs] def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0]
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/src/function.yaml b/functions/development/tf1_serving/latest/src/function.yaml deleted file mode 100644 index e6a57c4b..00000000 --- a/functions/development/tf1_serving/latest/src/function.yaml +++ /dev/null @@ -1,48 +0,0 @@ -kind: remote -metadata: - name: tf1-serving - tag: '' - hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf1 image classification server - min_replicas: 1 - max_replicas: 4 - env: - - name: MODEL_CLASS - value: TFModel - - name: ENABLE_EXPLAINER - value: false - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf1-serving - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py - spec: - runtime: python:3.6 - handler: tf1_serving:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnN0YW5jZXMiXSA9IFtucC52c3Rh
Y2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= - source: '' - function_kind: serving - build: - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py - default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/tf1_serving/latest/src/item.yaml b/functions/development/tf1_serving/latest/src/item.yaml deleted file mode 100644 index 6a5648ab..00000000 --- a/functions/development/tf1_serving/latest/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf1 image 
classification server -doc: '' -example: tf1_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: tf1-serving -platformVersion: 3.5.0 -spec: - env: - ENABLE_EXPLAINER: false - MODEL_CLASS: TFModel - filename: tf1_serving.py - handler: handler - image: mlrun/mlrun - kind: nuclio:serving - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/tf1_serving/latest/src/requirements.txt b/functions/development/tf1_serving/latest/src/requirements.txt deleted file mode 100644 index 8d3d1955..00000000 --- a/functions/development/tf1_serving/latest/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pillow -tensorflow \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/src/tf1_serving.ipynb b/functions/development/tf1_serving/latest/src/tf1_serving.ipynb deleted file mode 100644 index 1d42ee60..00000000 --- a/functions/development/tf1_serving/latest/src/tf1_serving.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the 
deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow==1.14 keras`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=TFModel\n", - "\n", - "# tensorflow version 1 requires a different version of python than \n", - "# the default (3.7), so we override the default tag here:\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio 
cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from keras.models import load_model\n", - "from keras.preprocessing import image\n", - "from keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.runtimes.MLModelServer):\n", - " def __init__(self, name: str, model_dir: str):\n", - " super().__init__(name, model_dir)\n", - "\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " self.classes = None\n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " pass\n", - " \n", - " def load(self):\n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(open(model_file, 'rb'))\n", - " \n", - " def preprocess(self, body):\n", - " try:\n", - " output = {'instances': []}\n", - " instances = body.get('instances', [])\n", - " for byte_image in instances:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['instances'].append(x)\n", - " \n", - " # Format instances list\n", - " output['instances'] = [np.vstack(output['instances'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('instances', [])\n", - "\n", - " # Predict\n", - " 
predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return predicted_probability\n", - " \n", - " def postprocess(self, predicted_probability):\n", - " if self.classes:\n", - " predicted_classes = np.around(predicted_probability, 1).tolist()[0]\n", - " predicted_probabilities = predicted_probability.tolist()[0]\n", - " return {\n", - " 'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], \n", - " f'{self.classes[\"1\"]}-probability': predicted_probabilities\n", - " }\n", - " else:\n", - " return predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os, requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "# Model Server variables\n", - "model_class = 'TFModel'\n", - "model_name = 'cat_vs_dog_tfv1' # Define for later use in tests\n", - "models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 128,\n", - " 'IMAGE_WIDTH': 128,\n", - " 'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "fn = new_model_server('tf1-serving', \n", - " model_class=model_class,\n", - " models=models)\n", - "fn.set_envs(function_envs)\n", - 
"fn.spec.description = \"tf1 image classification server\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-04-30 20:52:15,886 deploy started\n", - "[nuclio] 2020-04-30 20:53:46,385 (info) Build complete\n", - "[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete\n", - "[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending event: {\"data_url\": \"https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\"}\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# URL event\n", - "event_body = json.dumps({\"data_url\": cat_image_url})\n", 
- "print(f'Sending event: {event_body}')\n", - "\n", - "headers = {'Content-type': 'application/json'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)\n", - "response.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg\n" - ] - }, - { - "data": { - "text/plain": [ - "b'[0.0]'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# URL event\n", - "event_body = cat_image\n", - "print(f'Sending image from {cat_image_url}')\n", - "plt.imshow(img)\n", - "\n", - "headers = {'Content-type': 'image/jpeg'}\n", - "response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)\n", - "response.content" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf1_serving/latest/src/tf1_serving.py b/functions/development/tf1_serving/latest/src/tf1_serving.py deleted file mode 100644 index d9816c68..00000000 --- a/functions/development/tf1_serving/latest/src/tf1_serving.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from keras.models import load_model -from keras.preprocessing import image -from keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - - def load(self): - model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb")) - - def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability - - def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0] diff --git 
a/functions/development/tf1_serving/latest/static/documentation.html b/functions/development/tf1_serving/latest/static/documentation.html deleted file mode 100644 index e71da255..00000000 --- a/functions/development/tf1_serving/latest/static/documentation.html +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - -tf1_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

tf1_serving package

- -
- -
-
-
-
-
-

tf1_serving package#

-
-

Submodules#

-
-
-

tf1_serving.tf1_serving module#

-
-
-class tf1_serving.tf1_serving.TFModel(name: str, model_dir: str)[source]#
-

Bases: mlrun.serving.v1_serving.MLModelServer

-
-
-load()[source]#
-
-
-
-postprocess(predicted_probability)[source]#
-
-
-
-predict(data)[source]#
-
-
-
-preprocess(body)[source]#
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/static/example.html b/functions/development/tf1_serving/latest/static/example.html deleted file mode 100644 index adfbff17..00000000 --- a/functions/development/tf1_serving/latest/static/example.html +++ /dev/null @@ -1,691 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Image Classification Model - Serving Function#

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

-
    -
  • Define Nuclio function

    -
      -
    • Install dependencies and set config

    • -
    • Model serving class

    • -
    -
  • -
  • Deploy the serving function to the cluster

  • -
  • Define test parameters

  • -
  • Test the deployed function on the cluster

  • -
-
-

Define Nuclio Function#

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
-

Install dependencies and set config#

-
-

Note: Since tensorflow 1.14 is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow==1.14 keras

-
-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=TFModel
-
-# tensorflow version 1 requires a different version of python than 
-# the default (3.7), so we override the default tag here:
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow==1.14 keras==2.3.1 'h5py<3.0.0'
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code#

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
-

Model Serving Class#

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        self.classes = None
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            pass
-        
-    def load(self):
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(open(model_file, 'rb'))
-        
-    def preprocess(self, body):
-        try:
-            output = {'instances': []}
-            instances = body.get('instances', [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['instances'].append(x)
-            
-            # Format instances list
-            output['instances'] = [np.vstack(output['instances'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('instances', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability
-        
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                'prediction': [self.classes[str(int(cls))] for cls in predicted_classes], 
-                f'{self.classes["1"]}-probability': predicted_probabilities
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally#

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os, requests
-
-
-
-
-
-

Define test parameters#

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f1232ea52e8>
-
-
-_images/42de06ae5aa6f46639fa3ef8175a9de784413555ecd0f5444a2c569286d93af1.png -
-
-
-
-

Define Function specifications#

-
-
-
from mlrun import mlconf
-import os
-
-# Model Server variables
-model_class = 'TFModel'
-model_name = 'cat_vs_dog_tfv1' # Define for later use in tests
-models = {model_name: os.path.join(mlconf.artifact_path, 'tf1/cats_n_dogs.h5')}
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 128,
-    'IMAGE_WIDTH': 128,
-    'classes_map': os.path.join(mlconf.artifact_path, 'categories_map.json')
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster#

-
-
-
from mlrun import new_model_server, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-fn = new_model_server('tf1-serving', 
-                      model_class=model_class,
-                      models=models)
-fn.set_envs(function_envs)
-fn.spec.description = "tf1 image classification server"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-05-04 21:22:18,924 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f5a1585f908>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
[mlrun] 2020-04-30 20:52:15,886 deploy started
-[nuclio] 2020-04-30 20:53:46,385 (info) Build complete
-[nuclio] 2020-04-30 20:53:56,566 (info) Function deploy complete
-[nuclio] 2020-04-30 20:53:56,573 done updating tensorflow-v1-2layers, function address: 3.135.130.246:30961
-
-
-
-
-
-
-

Test the deployed function on the cluster#

-
-

Test the deployed function (with URL)#

-
-
-
# URL event
-event_body = json.dumps({"data_url": cat_image_url})
-print(f'Sending event: {event_body}')
-
-headers = {'Content-type': 'application/json'}
-response = requests.post(url=addr + f'/{model_name}/predict', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending event: {"data_url": "https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg"}
-
-
-
b'[0.0]'
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)#

-
-
-
# URL event
-event_body = cat_image
-print(f'Sending image from {cat_image_url}')
-plt.imshow(img)
-
-headers = {'Content-type': 'image/jpeg'}
-response = requests.post(url=addr + f'/{model_name}/predict/', data=event_body, headers=headers)
-response.content
-
-
-
-
-
Sending image from https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg
-
-
-
b'[0.0]'
-
-
-_images/42de06ae5aa6f46639fa3ef8175a9de784413555ecd0f5444a2c569286d93af1.png -
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/static/function.html b/functions/development/tf1_serving/latest/static/function.html deleted file mode 100644 index 9f1ba0c4..00000000 --- a/functions/development/tf1_serving/latest/static/function.html +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: tf1-serving
-  tag: ''
-  hash: 20cdeb2119a67fc51e55474ac84d386c7b658db3
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf1 image classification server
-  min_replicas: 1
-  max_replicas: 4
-  env:
-  - name: MODEL_CLASS
-    value: TFModel
-  - name: ENABLE_EXPLAINER
-    value: false
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf1-serving
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf1_serving/tf1_serving.py
-    spec:
-      runtime: python:3.6
-      handler: tf1_serving:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20ga2VyYXMubW9kZWxzIGltcG9ydCBsb2FkX21vZGVsCmZyb20ga2VyYXMucHJlcHJvY2Vzc2luZyBpbXBvcnQgaW1hZ2UKZnJvbSBrZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5ydW50aW1lcy5NTE1vZGVsU2VydmVyKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBuYW1lOiBzdHIsIG1vZGVsX2Rpcjogc3RyKToKICAgICAgICBzdXBlcigpLl9faW5pdF9fKG5hbWUsIG1vZGVsX2RpcikKCiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBwYXNzCgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIuaDUiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkX21vZGVsKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCgogICAgZGVmIHByZXByb2Nlc3Moc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7Imluc3RhbmNlcyI6IFtdfQogICAgICAgICAgICBpbnN0YW5jZXMgPSBib2R5LmdldCgiaW5zdGFuY2VzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGluc3RhbmNlczoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImluc3RhbmNlcyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJ
pbnN0YW5jZXMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnN0YW5jZXMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnN0YW5jZXMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eQoKICAgIGRlZiBwb3N0cHJvY2VzcyhzZWxmLCBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkpOgogICAgICAgIGlmIHNlbGYuY2xhc3NlczoKICAgICAgICAgICAgcHJlZGljdGVkX2NsYXNzZXMgPSBucC5hcm91bmQocHJlZGljdGVkX3Byb2JhYmlsaXR5LCAxKS50b2xpc3QoKVswXQogICAgICAgICAgICBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcyA9IHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQogICAgICAgICAgICByZXR1cm4gewogICAgICAgICAgICAgICAgInByZWRpY3Rpb24iOiBbCiAgICAgICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzW3N0cihpbnQoY2xzKSldIGZvciBjbHMgaW4gcHJlZGljdGVkX2NsYXNzZXMKICAgICAgICAgICAgICAgIF0sCiAgICAgICAgICAgICAgICBmJ3tzZWxmLmNsYXNzZXNbIjEiXX0tcHJvYmFiaWxpdHknOiBwcmVkaWN0ZWRfcHJvYmFiaWxpdGllcywKICAgICAgICAgICAgfQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBwcmVkaWN0ZWRfcHJvYmFiaWxpdHkudG9saXN0KClbMF0KCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
-  source: ''
-  function_kind: serving
-  build:
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf1_serving/tf1_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/static/item.html b/functions/development/tf1_serving/latest/static/item.html deleted file mode 100644 index 9e2bddfe..00000000 --- a/functions/development/tf1_serving/latest/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf1 image classification server
-doc: ''
-example: tf1_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: tf1-serving
-platformVersion: 3.5.0
-spec:
-  env:
-    ENABLE_EXPLAINER: false
-    MODEL_CLASS: TFModel
-  filename: tf1_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: nuclio:serving
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/static/source.html b/functions/development/tf1_serving/latest/static/source.html deleted file mode 100644 index 0ef12f0b..00000000 --- a/functions/development/tf1_serving/latest/static/source.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.runtimes.MLModelServer):
-    def __init__(self, name: str, model_dir: str):
-        super().__init__(name, model_dir)
-
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-        self.classes = None
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            pass
-
-    def load(self):
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(open(model_file, "rb"))
-
-    def preprocess(self, body):
-        try:
-            output = {"instances": []}
-            instances = body.get("instances", [])
-            for byte_image in instances:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["instances"].append(x)
-
-            output["instances"] = [np.vstack(output["instances"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("instances", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability
-
-    def postprocess(self, predicted_probability):
-        if self.classes:
-            predicted_classes = np.around(predicted_probability, 1).tolist()[0]
-            predicted_probabilities = predicted_probability.tolist()[0]
-            return {
-                "prediction": [
-                    self.classes[str(int(cls))] for cls in predicted_classes
-                ],
-                f'{self.classes["1"]}-probability': predicted_probabilities,
-            }
-        else:
-            return predicted_probability.tolist()[0]
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf1_serving/latest/static/tf1_serving.html b/functions/development/tf1_serving/latest/static/tf1_serving.html deleted file mode 100644 index f7654fa0..00000000 --- a/functions/development/tf1_serving/latest/static/tf1_serving.html +++ /dev/null @@ -1,227 +0,0 @@ - - - - - - - -tf1_serving.tf1_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for tf1_serving.tf1_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from keras.models import load_model
-from keras.preprocessing import image
-from keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
[docs]class TFModel(mlrun.runtimes.MLModelServer): - def __init__(self, name: str, model_dir: str): - super().__init__(name, model_dir) - - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - self.classes = None - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - pass - -
[docs] def load(self): - model_file, extra_data = self.get_model(".h5") - self.model = load_model(open(model_file, "rb"))
- -
[docs] def preprocess(self, body): - try: - output = {"instances": []} - instances = body.get("instances", []) - for byte_image in instances: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["instances"].append(x) - - output["instances"] = [np.vstack(output["instances"])] - return output - except: - raise Exception(f"received: {body}")
- -
[docs] def predict(self, data): - images = data.get("instances", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability
- -
[docs] def postprocess(self, predicted_probability): - if self.classes: - predicted_classes = np.around(predicted_probability, 1).tolist()[0] - predicted_probabilities = predicted_probability.tolist()[0] - return { - "prediction": [ - self.classes[str(int(cls))] for cls in predicted_classes - ], - f'{self.classes["1"]}-probability': predicted_probabilities, - } - else: - return predicted_probability.tolist()[0]
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.0.1/src/function.yaml b/functions/development/tf2_serving_v2/0.0.1/src/function.yaml deleted file mode 100644 index 311f6154..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: tf2-serving-v2 - tag: '' - hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9 - project: default - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf2 image classification server v2 - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf2-serving-v2 - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - spec: - runtime: python:3.6 - handler: tf2_serving_v2:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihm
InJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install requests pillow tensorflow>=2.1 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/tf2_serving_v2/0.0.1/src/item.yaml b/functions/development/tf2_serving_v2/0.0.1/src/item.yaml deleted file mode 100644 index 3588c517..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf2 image classification server v2 -doc: '' -example: tf2_serving_v2.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: tf2-serving-v2 -platformVersion: '' -spec: - filename: tf2_serving_v2.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: - - requests - - pillow - - tensorflow>=2.1 -url: '' -version: 0.0.1 diff --git 
a/functions/development/tf2_serving_v2/0.0.1/src/tf2_serving_v2.ipynb b/functions/development/tf2_serving_v2/0.0.1/src/tf2_serving_v2.ipynb deleted file mode 100644 index 6a15b11a..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/src/tf2_serving_v2.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'serving'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"serving\"\n", - "\n", - "# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow>=2.1\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults\n", - "> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}\n" - ] - } - ], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from tensorflow.keras.models import load_model\n", - "from tensorflow.keras.preprocessing import image\n", - "from tensorflow.keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.serving.V2ModelServer):\n", - "\n", - " def load(self):\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " \n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " self.classes = None\n", - " \n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(model_file)\n", - " \n", - " def preprocess(self, body, operation):\n", - " try:\n", - " output = {'inputs': []}\n", - " inputs = body.get('inputs', [])\n", - " for byte_image in inputs:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['inputs'].append(x)\n", - " \n", - " # Format inputs list\n", - " output['inputs'] = [np.vstack(output['inputs'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('inputs', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from mlrun import mlconf\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 224,\n", - " 'IMAGE_WIDTH': 224,\n", - " 'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "\n", - "fn = code_to_function('tf2-serving-v2', kind=\"serving\")\n", - "fn.spec.description = \"tf2 image classification server v2\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")\n", - "fn.set_envs(function_envs)\n", - "fn.add_model(key=\"model\",\n", - " 
model_path=\"/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5\",\n", - " class_name=\"TFModel\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,893 [info] Starting remote function deploy\n", - "2021-01-29 23:47:55 (info) Deploying function\n", - "2021-01-29 23:47:55 (info) Building\n", - "2021-01-29 23:47:55 (info) Staging files and preparing base images\n", - "2021-01-29 23:47:56 (info) Building processor image\n", - "2021-01-29 23:47:57 (info) Build complete\n", - "2021-01-29 23:48:07 (info) Function deploy complete\n", - "> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "payload = json.dumps({\"data_url\" : cat_image_url})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'id': '38224902-a688-4985-9424-578ff9ccb4a5',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict', body=payload)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict',\n", - " body=cat_image,\n", - " headers={'Content-type': 'image/jpeg'})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf2_serving_v2/0.0.1/src/tf2_serving_v2.py b/functions/development/tf2_serving_v2/0.0.1/src/tf2_serving_v2.py deleted file mode 100644 index 41c50488..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/src/tf2_serving_v2.py +++ /dev/null @@ -1,68 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from tensorflow.keras.models import load_model -from tensorflow.keras.preprocessing import image -from tensorflow.keras.preprocessing.image import load_img -from os import environ, path -from PIL 
import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.serving.V2ModelServer): - def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file) - - def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0] - - -from mlrun.runtimes import nuclio_init_hook - - -def init_context(context): - nuclio_init_hook(context, globals(), "serving_v2") - - -def handler(context, event): - return context.mlrun_handler(context, event) diff --git a/functions/development/tf2_serving_v2/0.0.1/static/documentation.html b/functions/development/tf2_serving_v2/0.0.1/static/documentation.html deleted file mode 100644 index 010e6fae..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -tf2_serving_v2 package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf2_serving_v2 package

-
-

Submodules

-
-
-

tf2_serving_v2.tf2_serving_v2 module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.0.1/static/example.html b/functions/development/tf2_serving_v2/0.0.1/static/example.html deleted file mode 100644 index e52f2066..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/static/example.html +++ /dev/null @@ -1,525 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow

-
-
-
-
%nuclio config kind="serving"
-
-# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'serving'
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow>=2.1
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults
-> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.serving.V2ModelServer):
-
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-        
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(model_file)
-        
-    def preprocess(self, body, operation):
-        try:
-            output = {'inputs': []}
-            inputs = body.get('inputs', [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['inputs'].append(x)
-            
-            # Format inputs list
-            output['inputs'] = [np.vstack(output['inputs'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('inputs', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f01b9643350>
-
-
-_images/tf2_serving_v2_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
import os
-from mlrun import mlconf
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 224,
-    'IMAGE_WIDTH': 224,
-    'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import code_to_function, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-
-fn = code_to_function('tf2-serving-v2', kind="serving")
-fn.spec.description = "tf2 image classification server v2"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-fn.set_envs(function_envs)
-fn.add_model(key="model",
-             model_path="/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5",
-             class_name="TFModel")
-
-
-
-
-
> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.serving.states.TaskState at 0x7f01b86cbd90>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
> 2021-01-29 23:47:54,893 [info] Starting remote function deploy
-2021-01-29 23:47:55  (info) Deploying function
-2021-01-29 23:47:55  (info) Building
-2021-01-29 23:47:55  (info) Staging files and preparing base images
-2021-01-29 23:47:56  (info) Building processor image
-2021-01-29 23:47:57  (info) Build complete
-2021-01-29 23:48:07  (info) Function deploy complete
-> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
payload = json.dumps({"data_url" : cat_image_url})
-
-
-
-
-
-
-
fn.invoke(path='/v2/models/model/predict', body=payload)
-
-
-
-
-
{'id': '38224902-a688-4985-9424-578ff9ccb4a5',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
fn.invoke(path='/v2/models/model/predict',
-          body=cat_image,
-          headers={'Content-type': 'image/jpeg'})
-
-
-
-
-
{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.0.1/static/function.html b/functions/development/tf2_serving_v2/0.0.1/static/function.html deleted file mode 100644 index 2abe7de0..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: tf2-serving-v2
-  tag: ''
-  hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9
-  project: default
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf2 image classification server v2
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf2-serving-v2
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-    spec:
-      runtime: python:3.6
-      handler: tf2_serving_v2:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICA
gICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install requests pillow tensorflow>=2.1
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.0.1/static/item.html b/functions/development/tf2_serving_v2/0.0.1/static/item.html deleted file mode 100644 index 4252c8f5..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf2 image classification server v2
-doc: ''
-example: tf2_serving_v2.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: tf2-serving-v2
-platformVersion: ''
-spec:
-  filename: tf2_serving_v2.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements:
-  - requests
-  - pillow
-  - tensorflow>=2.1
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.0.1/static/source.html b/functions/development/tf2_serving_v2/0.0.1/static/source.html deleted file mode 100644 index a68bd9fb..00000000 --- a/functions/development/tf2_serving_v2/0.0.1/static/source.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(model_file)
-
-    def preprocess(self, body, operation):
-        try:
-            output = {"inputs": []}
-            inputs = body.get("inputs", [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["inputs"].append(x)
-
-            output["inputs"] = [np.vstack(output["inputs"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("inputs", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability.tolist()[0]
-
-
-from mlrun.runtimes import nuclio_init_hook
-
-
-def init_context(context):
-    nuclio_init_hook(context, globals(), "serving_v2")
-
-
-def handler(context, event):
-    return context.mlrun_handler(context, event)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.8.0/src/function.yaml b/functions/development/tf2_serving_v2/0.8.0/src/function.yaml deleted file mode 100644 index 311f6154..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: tf2-serving-v2 - tag: '' - hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9 - project: default - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf2 image classification server v2 - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf2-serving-v2 - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - spec: - runtime: python:3.6 - handler: tf2_serving_v2:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihm
InJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install requests pillow tensorflow>=2.1 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/tf2_serving_v2/0.8.0/src/item.yaml b/functions/development/tf2_serving_v2/0.8.0/src/item.yaml deleted file mode 100644 index 6e1d4f6a..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf2 image classification server v2 -doc: '' -example: tf2_serving_v2.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: tf2-serving-v2 -platformVersion: 3.2.0 -spec: - filename: tf2_serving_v2.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: - - requests - - pillow - - tensorflow>=2.1 -url: '' -version: 0.8.0 diff --git 
a/functions/development/tf2_serving_v2/0.8.0/src/tf2_serving_v2.ipynb b/functions/development/tf2_serving_v2/0.8.0/src/tf2_serving_v2.ipynb deleted file mode 100644 index 6a15b11a..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/src/tf2_serving_v2.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'serving'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"serving\"\n", - "\n", - "# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow>=2.1\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults\n", - "> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}\n" - ] - } - ], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from tensorflow.keras.models import load_model\n", - "from tensorflow.keras.preprocessing import image\n", - "from tensorflow.keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.serving.V2ModelServer):\n", - "\n", - " def load(self):\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " \n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " self.classes = None\n", - " \n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(model_file)\n", - " \n", - " def preprocess(self, body, operation):\n", - " try:\n", - " output = {'inputs': []}\n", - " inputs = body.get('inputs', [])\n", - " for byte_image in inputs:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['inputs'].append(x)\n", - " \n", - " # Format inputs list\n", - " output['inputs'] = [np.vstack(output['inputs'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('inputs', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from mlrun import mlconf\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 224,\n", - " 'IMAGE_WIDTH': 224,\n", - " 'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "\n", - "fn = code_to_function('tf2-serving-v2', kind=\"serving\")\n", - "fn.spec.description = \"tf2 image classification server v2\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")\n", - "fn.set_envs(function_envs)\n", - "fn.add_model(key=\"model\",\n", - " 
model_path=\"/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5\",\n", - " class_name=\"TFModel\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,893 [info] Starting remote function deploy\n", - "2021-01-29 23:47:55 (info) Deploying function\n", - "2021-01-29 23:47:55 (info) Building\n", - "2021-01-29 23:47:55 (info) Staging files and preparing base images\n", - "2021-01-29 23:47:56 (info) Building processor image\n", - "2021-01-29 23:47:57 (info) Build complete\n", - "2021-01-29 23:48:07 (info) Function deploy complete\n", - "> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "payload = json.dumps({\"data_url\" : cat_image_url})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'id': '38224902-a688-4985-9424-578ff9ccb4a5',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict', body=payload)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict',\n", - " body=cat_image,\n", - " headers={'Content-type': 'image/jpeg'})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf2_serving_v2/0.8.0/src/tf2_serving_v2.py b/functions/development/tf2_serving_v2/0.8.0/src/tf2_serving_v2.py deleted file mode 100644 index 41c50488..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/src/tf2_serving_v2.py +++ /dev/null @@ -1,68 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from tensorflow.keras.models import load_model -from tensorflow.keras.preprocessing import image -from tensorflow.keras.preprocessing.image import load_img -from os import environ, path -from PIL 
import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.serving.V2ModelServer): - def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file) - - def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0] - - -from mlrun.runtimes import nuclio_init_hook - - -def init_context(context): - nuclio_init_hook(context, globals(), "serving_v2") - - -def handler(context, event): - return context.mlrun_handler(context, event) diff --git a/functions/development/tf2_serving_v2/0.8.0/static/documentation.html b/functions/development/tf2_serving_v2/0.8.0/static/documentation.html deleted file mode 100644 index 010e6fae..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -tf2_serving_v2 package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf2_serving_v2 package

-
-

Submodules

-
-
-

tf2_serving_v2.tf2_serving_v2 module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.8.0/static/example.html b/functions/development/tf2_serving_v2/0.8.0/static/example.html deleted file mode 100644 index 7d181843..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/static/example.html +++ /dev/null @@ -1,525 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow

-
-
-
-
%nuclio config kind="serving"
-
-# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'serving'
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow>=2.1
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults
-> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.serving.V2ModelServer):
-
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-        
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(model_file)
-        
-    def preprocess(self, body, operation):
-        try:
-            output = {'inputs': []}
-            inputs = body.get('inputs', [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['inputs'].append(x)
-            
-            # Format inputs list
-            output['inputs'] = [np.vstack(output['inputs'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('inputs', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f01b9643350>
-
-
-_images/tf2_serving_v2_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
import os
-from mlrun import mlconf
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 224,
-    'IMAGE_WIDTH': 224,
-    'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import code_to_function, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-
-fn = code_to_function('tf2-serving-v2', kind="serving")
-fn.spec.description = "tf2 image classification server v2"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-fn.set_envs(function_envs)
-fn.add_model(key="model",
-             model_path="/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5",
-             class_name="TFModel")
-
-
-
-
-
> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.serving.states.TaskState at 0x7f01b86cbd90>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
> 2021-01-29 23:47:54,893 [info] Starting remote function deploy
-2021-01-29 23:47:55  (info) Deploying function
-2021-01-29 23:47:55  (info) Building
-2021-01-29 23:47:55  (info) Staging files and preparing base images
-2021-01-29 23:47:56  (info) Building processor image
-2021-01-29 23:47:57  (info) Build complete
-2021-01-29 23:48:07  (info) Function deploy complete
-> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
payload = json.dumps({"data_url" : cat_image_url})
-
-
-
-
-
-
-
fn.invoke(path='/v2/models/model/predict', body=payload)
-
-
-
-
-
{'id': '38224902-a688-4985-9424-578ff9ccb4a5',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
fn.invoke(path='/v2/models/model/predict',
-          body=cat_image,
-          headers={'Content-type': 'image/jpeg'})
-
-
-
-
-
{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.8.0/static/function.html b/functions/development/tf2_serving_v2/0.8.0/static/function.html deleted file mode 100644 index 2abe7de0..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: tf2-serving-v2
-  tag: ''
-  hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9
-  project: default
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf2 image classification server v2
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf2-serving-v2
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-    spec:
-      runtime: python:3.6
-      handler: tf2_serving_v2:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICA
gICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install requests pillow tensorflow>=2.1
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.8.0/static/item.html b/functions/development/tf2_serving_v2/0.8.0/static/item.html deleted file mode 100644 index c7ab22df..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf2 image classification server v2
-doc: ''
-example: tf2_serving_v2.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: tf2-serving-v2
-platformVersion: 3.2.0
-spec:
-  filename: tf2_serving_v2.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements:
-  - requests
-  - pillow
-  - tensorflow>=2.1
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.8.0/static/source.html b/functions/development/tf2_serving_v2/0.8.0/static/source.html deleted file mode 100644 index a68bd9fb..00000000 --- a/functions/development/tf2_serving_v2/0.8.0/static/source.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(model_file)
-
-    def preprocess(self, body, operation):
-        try:
-            output = {"inputs": []}
-            inputs = body.get("inputs", [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["inputs"].append(x)
-
-            output["inputs"] = [np.vstack(output["inputs"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("inputs", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability.tolist()[0]
-
-
-from mlrun.runtimes import nuclio_init_hook
-
-
-def init_context(context):
-    nuclio_init_hook(context, globals(), "serving_v2")
-
-
-def handler(context, event):
-    return context.mlrun_handler(context, event)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.0/src/function.yaml b/functions/development/tf2_serving_v2/0.9.0/src/function.yaml deleted file mode 100644 index 4dbe9f3f..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: tf2-serving-v2 - tag: '' - hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf2 image classification server v2 - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf2-serving-v2 - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - spec: - runtime: python:3.6 - handler: tf2_serving_v2:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihm
InJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install requests pillow tensorflow>=2.1 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/tf2_serving_v2/0.9.0/src/item.yaml b/functions/development/tf2_serving_v2/0.9.0/src/item.yaml deleted file mode 100644 index 7431257f..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf2 image classification server v2 -doc: '' -example: tf2_serving_v2.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: tf2-serving-v2 -platformVersion: 3.2.0 -spec: - filename: tf2_serving_v2.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: - - requests - - pillow - - tensorflow>=2.1 -url: '' -version: 0.9.0 diff --git 
a/functions/development/tf2_serving_v2/0.9.0/src/tf2_serving_v2.ipynb b/functions/development/tf2_serving_v2/0.9.0/src/tf2_serving_v2.ipynb deleted file mode 100644 index 6a15b11a..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/src/tf2_serving_v2.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'serving'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"serving\"\n", - "\n", - "# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow>=2.1\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults\n", - "> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}\n" - ] - } - ], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from tensorflow.keras.models import load_model\n", - "from tensorflow.keras.preprocessing import image\n", - "from tensorflow.keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.serving.V2ModelServer):\n", - "\n", - " def load(self):\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " \n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " self.classes = None\n", - " \n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(model_file)\n", - " \n", - " def preprocess(self, body, operation):\n", - " try:\n", - " output = {'inputs': []}\n", - " inputs = body.get('inputs', [])\n", - " for byte_image in inputs:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['inputs'].append(x)\n", - " \n", - " # Format inputs list\n", - " output['inputs'] = [np.vstack(output['inputs'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('inputs', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from mlrun import mlconf\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 224,\n", - " 'IMAGE_WIDTH': 224,\n", - " 'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "\n", - "fn = code_to_function('tf2-serving-v2', kind=\"serving\")\n", - "fn.spec.description = \"tf2 image classification server v2\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")\n", - "fn.set_envs(function_envs)\n", - "fn.add_model(key=\"model\",\n", - " 
model_path=\"/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5\",\n", - " class_name=\"TFModel\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,893 [info] Starting remote function deploy\n", - "2021-01-29 23:47:55 (info) Deploying function\n", - "2021-01-29 23:47:55 (info) Building\n", - "2021-01-29 23:47:55 (info) Staging files and preparing base images\n", - "2021-01-29 23:47:56 (info) Building processor image\n", - "2021-01-29 23:47:57 (info) Build complete\n", - "2021-01-29 23:48:07 (info) Function deploy complete\n", - "> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "payload = json.dumps({\"data_url\" : cat_image_url})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'id': '38224902-a688-4985-9424-578ff9ccb4a5',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict', body=payload)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict',\n", - " body=cat_image,\n", - " headers={'Content-type': 'image/jpeg'})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf2_serving_v2/0.9.0/src/tf2_serving_v2.py b/functions/development/tf2_serving_v2/0.9.0/src/tf2_serving_v2.py deleted file mode 100644 index 41c50488..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/src/tf2_serving_v2.py +++ /dev/null @@ -1,68 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from tensorflow.keras.models import load_model -from tensorflow.keras.preprocessing import image -from tensorflow.keras.preprocessing.image import load_img -from os import environ, path -from PIL 
import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.serving.V2ModelServer): - def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file) - - def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0] - - -from mlrun.runtimes import nuclio_init_hook - - -def init_context(context): - nuclio_init_hook(context, globals(), "serving_v2") - - -def handler(context, event): - return context.mlrun_handler(context, event) diff --git a/functions/development/tf2_serving_v2/0.9.0/static/documentation.html b/functions/development/tf2_serving_v2/0.9.0/static/documentation.html deleted file mode 100644 index 010e6fae..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -tf2_serving_v2 package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf2_serving_v2 package

-
-

Submodules

-
-
-

tf2_serving_v2.tf2_serving_v2 module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.0/static/example.html b/functions/development/tf2_serving_v2/0.9.0/static/example.html deleted file mode 100644 index 7d181843..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/static/example.html +++ /dev/null @@ -1,525 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow

-
-
-
-
%nuclio config kind="serving"
-
-# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'serving'
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow>=2.1
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults
-> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.serving.V2ModelServer):
-
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-        
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(model_file)
-        
-    def preprocess(self, body, operation):
-        try:
-            output = {'inputs': []}
-            inputs = body.get('inputs', [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['inputs'].append(x)
-            
-            # Format inputs list
-            output['inputs'] = [np.vstack(output['inputs'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('inputs', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f01b9643350>
-
-
-_images/tf2_serving_v2_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
import os
-from mlrun import mlconf
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 224,
-    'IMAGE_WIDTH': 224,
-    'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import code_to_function, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-
-fn = code_to_function('tf2-serving-v2', kind="serving")
-fn.spec.description = "tf2 image classification server v2"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-fn.set_envs(function_envs)
-fn.add_model(key="model",
-             model_path="/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5",
-             class_name="TFModel")
-
-
-
-
-
> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.serving.states.TaskState at 0x7f01b86cbd90>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
> 2021-01-29 23:47:54,893 [info] Starting remote function deploy
-2021-01-29 23:47:55  (info) Deploying function
-2021-01-29 23:47:55  (info) Building
-2021-01-29 23:47:55  (info) Staging files and preparing base images
-2021-01-29 23:47:56  (info) Building processor image
-2021-01-29 23:47:57  (info) Build complete
-2021-01-29 23:48:07  (info) Function deploy complete
-> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
payload = json.dumps({"data_url" : cat_image_url})
-
-
-
-
-
-
-
fn.invoke(path='/v2/models/model/predict', body=payload)
-
-
-
-
-
{'id': '38224902-a688-4985-9424-578ff9ccb4a5',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
fn.invoke(path='/v2/models/model/predict',
-          body=cat_image,
-          headers={'Content-type': 'image/jpeg'})
-
-
-
-
-
{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.0/static/function.html b/functions/development/tf2_serving_v2/0.9.0/static/function.html deleted file mode 100644 index 1d00ec42..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: tf2-serving-v2
-  tag: ''
-  hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf2 image classification server v2
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf2-serving-v2
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-    spec:
-      runtime: python:3.6
-      handler: tf2_serving_v2:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICA
gICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install requests pillow tensorflow>=2.1
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.0/static/item.html b/functions/development/tf2_serving_v2/0.9.0/static/item.html deleted file mode 100644 index ac4f5d5f..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf2 image classification server v2
-doc: ''
-example: tf2_serving_v2.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: tf2-serving-v2
-platformVersion: 3.2.0
-spec:
-  filename: tf2_serving_v2.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements:
-  - requests
-  - pillow
-  - tensorflow>=2.1
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.0/static/source.html b/functions/development/tf2_serving_v2/0.9.0/static/source.html deleted file mode 100644 index a68bd9fb..00000000 --- a/functions/development/tf2_serving_v2/0.9.0/static/source.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(model_file)
-
-    def preprocess(self, body, operation):
-        try:
-            output = {"inputs": []}
-            inputs = body.get("inputs", [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["inputs"].append(x)
-
-            output["inputs"] = [np.vstack(output["inputs"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("inputs", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability.tolist()[0]
-
-
-from mlrun.runtimes import nuclio_init_hook
-
-
-def init_context(context):
-    nuclio_init_hook(context, globals(), "serving_v2")
-
-
-def handler(context, event):
-    return context.mlrun_handler(context, event)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.1/src/function.yaml b/functions/development/tf2_serving_v2/0.9.1/src/function.yaml deleted file mode 100644 index 4dbe9f3f..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: tf2-serving-v2 - tag: '' - hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf2 image classification server v2 - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf2-serving-v2 - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - spec: - runtime: python:3.6 - handler: tf2_serving_v2:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihm
InJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install requests pillow tensorflow>=2.1 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/tf2_serving_v2/0.9.1/src/item.yaml b/functions/development/tf2_serving_v2/0.9.1/src/item.yaml deleted file mode 100644 index cd9a66e6..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf2 image classification server v2 -doc: '' -example: tf2_serving_v2.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: tf2-serving-v2 -platformVersion: 3.2.0 -spec: - filename: tf2_serving_v2.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: - - requests - - pillow - - tensorflow>=2.1 -url: '' -version: 0.9.1 diff --git 
a/functions/development/tf2_serving_v2/0.9.1/src/requirements.txt b/functions/development/tf2_serving_v2/0.9.1/src/requirements.txt deleted file mode 100644 index 8d3d1955..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pillow -tensorflow \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.1/src/tf2_serving_v2.ipynb b/functions/development/tf2_serving_v2/0.9.1/src/tf2_serving_v2.ipynb deleted file mode 100644 index 6a15b11a..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/src/tf2_serving_v2.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual 
function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'serving'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"serving\"\n", - "\n", - "# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow>=2.1\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults\n", - "> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}\n" - ] - } - ], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from tensorflow.keras.models import load_model\n", - "from tensorflow.keras.preprocessing import image\n", - "from tensorflow.keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.serving.V2ModelServer):\n", - "\n", - " def load(self):\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " \n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " self.classes = None\n", - " \n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(model_file)\n", - " \n", - " def preprocess(self, body, operation):\n", - " try:\n", - " output = {'inputs': []}\n", - " inputs = body.get('inputs', [])\n", - " for byte_image in inputs:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['inputs'].append(x)\n", - " \n", - " # Format inputs list\n", - " output['inputs'] = [np.vstack(output['inputs'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('inputs', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from mlrun import mlconf\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 224,\n", - " 'IMAGE_WIDTH': 224,\n", - " 'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "\n", - "fn = code_to_function('tf2-serving-v2', kind=\"serving\")\n", - "fn.spec.description = \"tf2 image classification server v2\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")\n", - "fn.set_envs(function_envs)\n", - "fn.add_model(key=\"model\",\n", - " 
model_path=\"/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5\",\n", - " class_name=\"TFModel\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,893 [info] Starting remote function deploy\n", - "2021-01-29 23:47:55 (info) Deploying function\n", - "2021-01-29 23:47:55 (info) Building\n", - "2021-01-29 23:47:55 (info) Staging files and preparing base images\n", - "2021-01-29 23:47:56 (info) Building processor image\n", - "2021-01-29 23:47:57 (info) Build complete\n", - "2021-01-29 23:48:07 (info) Function deploy complete\n", - "> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "payload = json.dumps({\"data_url\" : cat_image_url})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'id': '38224902-a688-4985-9424-578ff9ccb4a5',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict', body=payload)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict',\n", - " body=cat_image,\n", - " headers={'Content-type': 'image/jpeg'})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf2_serving_v2/0.9.1/src/tf2_serving_v2.py b/functions/development/tf2_serving_v2/0.9.1/src/tf2_serving_v2.py deleted file mode 100644 index 41c50488..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/src/tf2_serving_v2.py +++ /dev/null @@ -1,68 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from tensorflow.keras.models import load_model -from tensorflow.keras.preprocessing import image -from tensorflow.keras.preprocessing.image import load_img -from os import environ, path -from PIL 
import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.serving.V2ModelServer): - def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file) - - def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0] - - -from mlrun.runtimes import nuclio_init_hook - - -def init_context(context): - nuclio_init_hook(context, globals(), "serving_v2") - - -def handler(context, event): - return context.mlrun_handler(context, event) diff --git a/functions/development/tf2_serving_v2/0.9.1/static/documentation.html b/functions/development/tf2_serving_v2/0.9.1/static/documentation.html deleted file mode 100644 index 10cbdda3..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/static/documentation.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - - -tf2_serving_v2 package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

tf2_serving_v2 package

-
-

Submodules

-
-
-

tf2_serving_v2.tf2_serving_v2 module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.1/static/example.html b/functions/development/tf2_serving_v2/0.9.1/static/example.html deleted file mode 100644 index e468e768..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/static/example.html +++ /dev/null @@ -1,528 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Image Classification Model - Serving Function

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

- -
-

Define Nuclio Function

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
-
-
-
-
-
-

Install dependencies and set config

-
-

Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow

-
-
-
-
%nuclio config kind="serving"
-
-# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'serving'
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow>=2.1
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults
-> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}
-
-
-
-
-
-

Model Serving Class

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.serving.V2ModelServer):
-
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-        
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(model_file)
-        
-    def preprocess(self, body, operation):
-        try:
-            output = {'inputs': []}
-            inputs = body.get('inputs', [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['inputs'].append(x)
-            
-            # Format inputs list
-            output['inputs'] = [np.vstack(output['inputs'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('inputs', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os
-
-
-
-
-
-

Define test parameters

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f01b9643350>
-
-
-_images/tf2_serving_v2_example_20_2.png -
-
-
-
-

Define Function specifications

-
-
-
import os
-from mlrun import mlconf
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 224,
-    'IMAGE_WIDTH': 224,
-    'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster

-
-
-
from mlrun import code_to_function, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-
-fn = code_to_function('tf2-serving-v2', kind="serving")
-fn.spec.description = "tf2 image classification server v2"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-fn.set_envs(function_envs)
-fn.add_model(key="model",
-             model_path="/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5",
-             class_name="TFModel")
-
-
-
-
-
> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.serving.states.TaskState at 0x7f01b86cbd90>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
> 2021-01-29 23:47:54,893 [info] Starting remote function deploy
-2021-01-29 23:47:55  (info) Deploying function
-2021-01-29 23:47:55  (info) Building
-2021-01-29 23:47:55  (info) Staging files and preparing base images
-2021-01-29 23:47:56  (info) Building processor image
-2021-01-29 23:47:57  (info) Build complete
-2021-01-29 23:48:07  (info) Function deploy complete
-> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946
-
-
-
-
-
-
-

Test the deployed function on the cluster

-
-

Test the deployed function (with URL)

-
-
-
payload = json.dumps({"data_url" : cat_image_url})
-
-
-
-
-
-
-
fn.invoke(path='/v2/models/model/predict', body=payload)
-
-
-
-
-
{'id': '38224902-a688-4985-9424-578ff9ccb4a5',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)

-
-
-
fn.invoke(path='/v2/models/model/predict',
-          body=cat_image,
-          headers={'Content-type': 'image/jpeg'})
-
-
-
-
-
{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.1/static/function.html b/functions/development/tf2_serving_v2/0.9.1/static/function.html deleted file mode 100644 index 1d00ec42..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: tf2-serving-v2
-  tag: ''
-  hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf2 image classification server v2
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf2-serving-v2
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-    spec:
-      runtime: python:3.6
-      handler: tf2_serving_v2:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICA
gICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install requests pillow tensorflow>=2.1
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.1/static/item.html b/functions/development/tf2_serving_v2/0.9.1/static/item.html deleted file mode 100644 index a85a2f09..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf2 image classification server v2
-doc: ''
-example: tf2_serving_v2.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: tf2-serving-v2
-platformVersion: 3.2.0
-spec:
-  filename: tf2_serving_v2.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements:
-  - requests
-  - pillow
-  - tensorflow>=2.1
-url: ''
-version: 0.9.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/0.9.1/static/source.html b/functions/development/tf2_serving_v2/0.9.1/static/source.html deleted file mode 100644 index a68bd9fb..00000000 --- a/functions/development/tf2_serving_v2/0.9.1/static/source.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(model_file)
-
-    def preprocess(self, body, operation):
-        try:
-            output = {"inputs": []}
-            inputs = body.get("inputs", [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["inputs"].append(x)
-
-            output["inputs"] = [np.vstack(output["inputs"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("inputs", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability.tolist()[0]
-
-
-from mlrun.runtimes import nuclio_init_hook
-
-
-def init_context(context):
-    nuclio_init_hook(context, globals(), "serving_v2")
-
-
-def handler(context, event):
-    return context.mlrun_handler(context, event)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/src/function.yaml b/functions/development/tf2_serving_v2/1.1.0/src/function.yaml deleted file mode 100644 index 4dbe9f3f..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: tf2-serving-v2 - tag: '' - hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf2 image classification server v2 - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf2-serving-v2 - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - spec: - runtime: python:3.6 - handler: tf2_serving_v2:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihm
InJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install requests pillow tensorflow>=2.1 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/tf2_serving_v2/1.1.0/src/item.yaml b/functions/development/tf2_serving_v2/1.1.0/src/item.yaml deleted file mode 100644 index dc7640b0..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf2 image classification server v2 -doc: '' -example: tf2_serving_v2.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: tf2-serving-v2 -platformVersion: 3.5.0 -spec: - filename: tf2_serving_v2.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: - - requests - - pillow - - tensorflow>=2.1 -url: '' -version: 1.1.0 diff --git 
a/functions/development/tf2_serving_v2/1.1.0/src/requirements.txt b/functions/development/tf2_serving_v2/1.1.0/src/requirements.txt deleted file mode 100644 index 8d3d1955..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pillow -tensorflow \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/src/tf2_serving_v2.ipynb b/functions/development/tf2_serving_v2/1.1.0/src/tf2_serving_v2.ipynb deleted file mode 100644 index 6a15b11a..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/src/tf2_serving_v2.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual 
function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'serving'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"serving\"\n", - "\n", - "# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow>=2.1\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults\n", - "> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}\n" - ] - } - ], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from tensorflow.keras.models import load_model\n", - "from tensorflow.keras.preprocessing import image\n", - "from tensorflow.keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.serving.V2ModelServer):\n", - "\n", - " def load(self):\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " \n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " self.classes = None\n", - " \n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(model_file)\n", - " \n", - " def preprocess(self, body, operation):\n", - " try:\n", - " output = {'inputs': []}\n", - " inputs = body.get('inputs', [])\n", - " for byte_image in inputs:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['inputs'].append(x)\n", - " \n", - " # Format inputs list\n", - " output['inputs'] = [np.vstack(output['inputs'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('inputs', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from mlrun import mlconf\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 224,\n", - " 'IMAGE_WIDTH': 224,\n", - " 'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "\n", - "fn = code_to_function('tf2-serving-v2', kind=\"serving\")\n", - "fn.spec.description = \"tf2 image classification server v2\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")\n", - "fn.set_envs(function_envs)\n", - "fn.add_model(key=\"model\",\n", - " 
model_path=\"/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5\",\n", - " class_name=\"TFModel\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,893 [info] Starting remote function deploy\n", - "2021-01-29 23:47:55 (info) Deploying function\n", - "2021-01-29 23:47:55 (info) Building\n", - "2021-01-29 23:47:55 (info) Staging files and preparing base images\n", - "2021-01-29 23:47:56 (info) Building processor image\n", - "2021-01-29 23:47:57 (info) Build complete\n", - "2021-01-29 23:48:07 (info) Function deploy complete\n", - "> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "payload = json.dumps({\"data_url\" : cat_image_url})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'id': '38224902-a688-4985-9424-578ff9ccb4a5',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict', body=payload)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict',\n", - " body=cat_image,\n", - " headers={'Content-type': 'image/jpeg'})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf2_serving_v2/1.1.0/src/tf2_serving_v2.py b/functions/development/tf2_serving_v2/1.1.0/src/tf2_serving_v2.py deleted file mode 100644 index d3642c20..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/src/tf2_serving_v2.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from tensorflow.keras.models import load_model -from tensorflow.keras.preprocessing import image -from tensorflow.keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.serving.V2ModelServer): - def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file) - - def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0] - - -from mlrun.runtimes import nuclio_init_hook - - -def init_context(context): - 
nuclio_init_hook(context, globals(), "serving_v2") - - -def handler(context, event): - return context.mlrun_handler(context, event) diff --git a/functions/development/tf2_serving_v2/1.1.0/static/documentation.html b/functions/development/tf2_serving_v2/1.1.0/static/documentation.html deleted file mode 100644 index 99a5cbd7..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/static/documentation.html +++ /dev/null @@ -1,250 +0,0 @@ - - - - - - - -tf2_serving_v2 package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

tf2_serving_v2 package

- -
- -
-
-
-
-
-

tf2_serving_v2 package#

-
-

Submodules#

-
-
-

tf2_serving_v2.tf2_serving_v2 module#

-
-
-class tf2_serving_v2.tf2_serving_v2.TFModel(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]#
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]#
-

model loading function, see also .get_model() method

-
-
-
-predict(data)[source]#
-

model prediction operation

-
-
-
-preprocess(body, operation)[source]#
-

preprocess the event body before validate and action

-
-
-
-
-tf2_serving_v2.tf2_serving_v2.handler(context, event)[source]#
-
-
-
-tf2_serving_v2.tf2_serving_v2.init_context(context)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/static/example.html b/functions/development/tf2_serving_v2/1.1.0/static/example.html deleted file mode 100644 index edba841b..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/static/example.html +++ /dev/null @@ -1,680 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Image Classification Model - Serving Function#

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

-
    -
  • Define Nuclio function

    -
      -
    • Install dependencies and set config

    • -
    • Model serving class

    • -
    -
  • -
  • Deploy the serving function to the cluster

  • -
  • Define test parameters

  • -
  • Test the deployed function on the cluster

  • -
-
-

Define Nuclio Function#

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
-
-
-
-
-
-

Install dependencies and set config#

-
-

Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow

-
-
-
-
%nuclio config kind="serving"
-
-# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'serving'
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow>=2.1
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code#

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults
-> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}
-
-
-
-
-
-

Model Serving Class#

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.serving.V2ModelServer):
-
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-        
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(model_file)
-        
-    def preprocess(self, body, operation):
-        try:
-            output = {'inputs': []}
-            inputs = body.get('inputs', [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['inputs'].append(x)
-            
-            # Format inputs list
-            output['inputs'] = [np.vstack(output['inputs'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('inputs', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally#

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os
-
-
-
-
-
-

Define test parameters#

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f01b9643350>
-
-
-_images/cd2de2ed0d1841d97e37ee5a2c7a1f70e7e7ba5a4dc5416e11e1af47c1b99e03.png -
-
-
-
-

Define Function specifications#

-
-
-
import os
-from mlrun import mlconf
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 224,
-    'IMAGE_WIDTH': 224,
-    'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster#

-
-
-
from mlrun import code_to_function, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-
-fn = code_to_function('tf2-serving-v2', kind="serving")
-fn.spec.description = "tf2 image classification server v2"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-fn.set_envs(function_envs)
-fn.add_model(key="model",
-             model_path="/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5",
-             class_name="TFModel")
-
-
-
-
-
> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.serving.states.TaskState at 0x7f01b86cbd90>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
> 2021-01-29 23:47:54,893 [info] Starting remote function deploy
-2021-01-29 23:47:55  (info) Deploying function
-2021-01-29 23:47:55  (info) Building
-2021-01-29 23:47:55  (info) Staging files and preparing base images
-2021-01-29 23:47:56  (info) Building processor image
-2021-01-29 23:47:57  (info) Build complete
-2021-01-29 23:48:07  (info) Function deploy complete
-> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946
-
-
-
-
-
-
-

Test the deployed function on the cluster#

-
-

Test the deployed function (with URL)#

-
-
-
payload = json.dumps({"data_url" : cat_image_url})
-
-
-
-
-
-
-
fn.invoke(path='/v2/models/model/predict', body=payload)
-
-
-
-
-
{'id': '38224902-a688-4985-9424-578ff9ccb4a5',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)#

-
-
-
fn.invoke(path='/v2/models/model/predict',
-          body=cat_image,
-          headers={'Content-type': 'image/jpeg'})
-
-
-
-
-
{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/static/function.html b/functions/development/tf2_serving_v2/1.1.0/static/function.html deleted file mode 100644 index 1d00ec42..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: tf2-serving-v2
-  tag: ''
-  hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf2 image classification server v2
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf2-serving-v2
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-    spec:
-      runtime: python:3.6
-      handler: tf2_serving_v2:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICA
gICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install requests pillow tensorflow>=2.1
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/static/item.html b/functions/development/tf2_serving_v2/1.1.0/static/item.html deleted file mode 100644 index fdc080f8..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf2 image classification server v2
-doc: ''
-example: tf2_serving_v2.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: tf2-serving-v2
-platformVersion: 3.5.0
-spec:
-  filename: tf2_serving_v2.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements:
-  - requests
-  - pillow
-  - tensorflow>=2.1
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/static/source.html b/functions/development/tf2_serving_v2/1.1.0/static/source.html deleted file mode 100644 index 65434955..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/static/source.html +++ /dev/null @@ -1,104 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(model_file)
-
-    def preprocess(self, body, operation):
-        try:
-            output = {"inputs": []}
-            inputs = body.get("inputs", [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["inputs"].append(x)
-
-            output["inputs"] = [np.vstack(output["inputs"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("inputs", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability.tolist()[0]
-
-
-from mlrun.runtimes import nuclio_init_hook
-
-
-def init_context(context):
-    nuclio_init_hook(context, globals(), "serving_v2")
-
-
-def handler(context, event):
-    return context.mlrun_handler(context, event)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/1.1.0/static/tf2_serving_v2.html b/functions/development/tf2_serving_v2/1.1.0/static/tf2_serving_v2.html deleted file mode 100644 index 27b5b357..00000000 --- a/functions/development/tf2_serving_v2/1.1.0/static/tf2_serving_v2.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - -tf2_serving_v2.tf2_serving_v2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for tf2_serving_v2.tf2_serving_v2

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
[docs]class TFModel(mlrun.serving.V2ModelServer): -
[docs] def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file)
- -
[docs] def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}")
- -
[docs] def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0]
- - -from mlrun.runtimes import nuclio_init_hook - - -
[docs]def init_context(context): - nuclio_init_hook(context, globals(), "serving_v2")
- - -
[docs]def handler(context, event): - return context.mlrun_handler(context, event)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/src/function.yaml b/functions/development/tf2_serving_v2/latest/src/function.yaml deleted file mode 100644 index 4dbe9f3f..00000000 --- a/functions/development/tf2_serving_v2/latest/src/function.yaml +++ /dev/null @@ -1,45 +0,0 @@ -kind: serving -metadata: - name: tf2-serving-v2 - tag: '' - hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9 - project: '' - labels: - author: yaronh - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - description: tf2 image classification server v2 - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: - apiVersion: nuclio.io/v1 - kind: Function - metadata: - name: tf2-serving-v2 - labels: {} - annotations: - nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - spec: - runtime: python:3.6 - handler: tf2_serving_v2:handler - env: [] - volumes: [] - build: - commands: [] - noBaseImagesPull: true - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbihm
InJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - source: '' - function_kind: serving_v2 - build: - commands: - - python -m pip install requests pillow tensorflow>=2.1 - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py - secret_sources: [] - affinity: null -verbose: false diff --git a/functions/development/tf2_serving_v2/latest/src/item.yaml b/functions/development/tf2_serving_v2/latest/src/item.yaml deleted file mode 100644 index dc7640b0..00000000 --- a/functions/development/tf2_serving_v2/latest/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: tf2 image classification server v2 -doc: '' -example: tf2_serving_v2.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: yaronh -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: tf2-serving-v2 -platformVersion: 3.5.0 -spec: - filename: tf2_serving_v2.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: - - requests - - pillow - - tensorflow>=2.1 -url: '' -version: 1.1.0 diff --git 
a/functions/development/tf2_serving_v2/latest/src/requirements.txt b/functions/development/tf2_serving_v2/latest/src/requirements.txt deleted file mode 100644 index 8d3d1955..00000000 --- a/functions/development/tf2_serving_v2/latest/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pillow -tensorflow \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/src/tf2_serving_v2.ipynb b/functions/development/tf2_serving_v2/latest/src/tf2_serving_v2.ipynb deleted file mode 100644 index 6a15b11a..00000000 --- a/functions/development/tf2_serving_v2/latest/src/tf2_serving_v2.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Image Classification Model - Serving Function\n", - "\n", - "This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.\n", - "\n", - "**In this notebook you will:**\n", - "* Write a Tensorflow-Model class to load and predict on the incoming data\n", - "* Deploy the model as a serverless function\n", - "* Invoke the serving endpoint with data as:\n", - " * URLs to images hosted on S3\n", - " * Direct image send\n", - " \n", - "**Steps:** \n", - "* [Define Nuclio function](#Define-Nuclio-function) \n", - " * [Install dependencies and set config](#Install-dependencies-and-set-config) \n", - " * [Model serving class](#Model-Serving-Class) \n", - "* [Deploy the serving function to the cluster](#Deploy-the-serving-function-to-the-cluster) \n", - "* [Define test parameters](#Define-test-parameters)\n", - "* [Test the deployed function on the cluster](#Test-the-deployed-function-on-the-cluster)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Nuclio Function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio \n", - "Since we do not want to import nuclio in the actual 
function, the comment annotation `nuclio: ignore` is used. This marks the cell for nuclio, telling it to ignore the cell's values when building the function." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n" - ] - } - ], - "source": [ - "# nuclio: ignore\n", - "import nuclio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install dependencies and set config\n", - "> Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command.\n", - "If it is not installed on your system please uninstall and install using the line: `pip install tensorflow`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'serving'\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"serving\"\n", - "\n", - "# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the `%nuclio cmd` annotation. \n", - ">`%nuclio cmd` works both locally and during deployment by default, but can be set with `-c` flag to only run the commands while deploying or `-l` to set the variable for the local environment only." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "pip install tensorflow>=2.1\n", - "pip install requests pillow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults\n", - "> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}\n" - ] - } - ], - "source": [ - "import json\n", - "import numpy as np\n", - "import requests\n", - "from tensorflow import keras\n", - "from tensorflow.keras.models import load_model\n", - "from tensorflow.keras.preprocessing import image\n", - "from tensorflow.keras.preprocessing.image import load_img\n", - "from os import environ, path\n", - "from PIL import Image\n", - "from io import BytesIO\n", - "from urllib.request import urlopen\n", - "import mlrun" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Serving Class" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the `TFModel` class which we will use to define data handling and prediction of our model. 
\n", - "\n", - "The class should consist of:\n", - "* `__init__(name, model_dir)` - Setup the internal parameters\n", - "* `load(self)` - How to load the model and broadcast it's ready for prediction\n", - "* `preprocess(self, body)` - How to handle the incoming event, forming the request to an `{'instances': []}` dictionary as requested by the protocol\n", - "* `predict(self, data)` - Receives and `{'instances': []}` and returns the model's prediction as a list\n", - "* `postprocess(self, data)` - Does any additional processing needed on the predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "class TFModel(mlrun.serving.V2ModelServer):\n", - "\n", - " def load(self):\n", - " self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))\n", - " self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))\n", - " \n", - " try:\n", - " with open(environ['classes_map'], 'r') as f:\n", - " self.classes = json.load(f)\n", - " except:\n", - " self.classes = None\n", - " \n", - " model_file, extra_data = self.get_model('.h5')\n", - " self.model = load_model(model_file)\n", - " \n", - " def preprocess(self, body, operation):\n", - " try:\n", - " output = {'inputs': []}\n", - " inputs = body.get('inputs', [])\n", - " for byte_image in inputs:\n", - " img = Image.open(byte_image)\n", - " img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))\n", - "\n", - " # Load image\n", - " x = image.img_to_array(img)\n", - " x = np.expand_dims(x, axis=0)\n", - " output['inputs'].append(x)\n", - " \n", - " # Format inputs list\n", - " output['inputs'] = [np.vstack(output['inputs'])]\n", - " return output\n", - " except:\n", - " raise Exception(f'received: {body}')\n", - " \n", - "\n", - " def predict(self, data):\n", - " images = data.get('inputs', [])\n", - "\n", - " # Predict\n", - " predicted_probability = self.model.predict(images)\n", - "\n", - " # return prediction\n", - " return 
predicted_probability.tolist()[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To let our nuclio builder know that our function code ends at this point we will use the comment annotation `nuclio: end-code`. \n", - "\n", - "Any new cell from now on will be treated as if a `nuclio: ignore` comment was set, and will not be added to the funcion." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the function locally" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing\n", - "\n", - "Set the served models and their file paths using: `SERVING_MODEL_ = `\n", - "\n", - "> Note: this notebook assumes the model and categories are under /User/mlrun/examples/" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "from io import BytesIO\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define test parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test image:\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQUAAAD8CAYAAAB+fLH0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOy9SYylWZbn9bvDN7/RzNzDY8iMzMqsAXW3uoCmG6k3IARi16tGwAYkpF4g1lBrJKTes6KFkNggYNOClkoMQmLDILXYMDRZ2VkVmRkR7m7mZvbGbx4ui/uda8+CiKpqqgIcya8UCrdnz77h3nPP8D//c65yzvFhfBgfxochQ/9//QAfxofxYbxf44NS+DA+jA/j2figFD6MD+PDeDY+KIUP48P4MJ6ND0rhw/gwPoxn44NS+DA+jA/j2fjelIJS6l9USv2BUuoXSqnf+77u82F8GB/Gn+9Q3wdPQSllgJ8D/zzwFfD3gX/FOfcP/txv9mF8GB/Gn+v4vjyFvwr8wjn3R865DvhPgb/xPd3rw/gwPow/x2G/p+t+Cnx58fNXwF/7ri8XeeaurzbgwLlp/lQBDgdopbHWME2OcRxxzqG0AudAqfnvHNM0Io6PUqC1YZompmlEKY0xBqVgmvw9tDZorRnHyd9rvp7Cf0cphdYa5/xzuMkBDqUU4mEppf1TzvfVWgEqPIvWCqXU/DsdrjmOI9M0hnfVWjE5h0KhjUZexDmYnAP39HzWmIvnmadAfj9/oOb3UFrjpmn+nWJy03xtf8/L6zg3hWf0w4X3H4bx4nMwRqOUZhyHeT6f3lGe3/lLhOV009PcOeevDcqv+fy8MuRW/vt+PSbnmMYJYw1KKYZhJE0S+r4PEoNinh/HOAz+AZRGacM0juGa0+S8jBiDnm/mnH8+kTEvO9PTM84yY4whiizOOfph8J8rjXPT/Hfze4pcKP975nXy31dYa3EOuq4N99Zag1K4aWKaHDayaJkzYBwGL/ZaYea1cvNEh/+HuZhlYP6OPJ+a5eP17bt759wLvjG+L6WgvuWzZ3GKUupvAX8LYLte8W//m/86wzDQ9z3WWpIkwTlHWZZ0XUcURSyXy7BoAGmacjweyfMcpRRv3rwJnzvnsNYyjiN1XQOwWCxIkoSmaQCIogSlDOfzmTiOGceRrusAMMaEZ+26jiRJiKKItm2Zpomu64IwGmOePVsURYDfINZa2rbFWstqtQJgGAaGoaGqKsZxpO97xnEkjmP6vn+mRMCwWPhrx3Ec7q2UwhgTvt+2LeM4kqYpaZpirQ0b/Hg8MgwDWZYxDH4Tr1YrkiShrmuapuFwOITnnaaJKLKsVgVZllGWJdM00TT+meM4pigKuq7jfD5zfX1N3/fEcUwURXRdxzRNxHGM1jq83zRp0jSlLEsOh0OY02EYiOM4zHlkIJoVn9aGrh9o+4GmbWnbDrRhGEestlxtrqjKE8YosjQBN+KmAa0BN9F1DcOkaHq/IZfLJUmS0HVdeOc8z7HW0jQNTdMwTRPX19cMw0BVVUEGqqpCa816vSbP86A4RGacc1RVxTAM5HkelEvXdWEunHNhHbMswxjD6XQKMqm1Rms9y8jAer1+9ndt6xVI0zQkSUIcxxwOB5bLJcYYyrLEOcdisQjy5Jw3pm3bcj6fiaKINE359/79//BX37Z5vy+l8BXwg4ufPwNeX37BOfd3gL8D8KMffOoWiwVd11GWZRDuoii4vr5mt9txOp3Y7/dBcIZhYL/f0zRNmLzlckld12Gh2rYNm2SYNfo0TYzjOE/6hFIG5xxpmoaNFUURRVEwDEO4Xtu2YRMOw4BSiqIoSNOUruvouo5hGIJCE6V2+dk4jpzP5/laXuGlaUocx5Rlyel0Iooi4jjGWosxBuc0p9OJ5XJJ0zRhg+V5zmq14ng8cjqdAIIH0rYtfd/PnpEKgm+M94yapmEcx/BcxhiqquJwOATl6zdPFJSZCHG
apuG6dV1jrSWO47BBmqYJn0dR9GT95jW7VPLWWtI0vZQJv8FGsKlFaY2xEVmUMNU1/TCRZpZzWdH3PWmRst/vcW6iKDLiJMWNPU3Tk8Ypxij6oSe2lm4cMMZ7jlVVURQFm82Gsiw5n88YY4IiBcKmj6KIJEmYpondbhcUZNu2NE1DHMcMw4DWOijFOI7JsoymaViv1+z3e9q2RSnFYrHAGBM+y7KMLMsoiiIohiRJOJ/P9H1P3/dEURRkNo5jlsslAPv9Pqx9WZYsFoug+MVgiZLvuo5xHDHGBEP0XeP7Ugp/H/hNpdSPga+Bfxn4V7/ry9M0hY0mgn04HIIlFSEGqOsaYwxpmqKUCpv5dDoFyyMT0Pc90zQ9UyTiDfgJijDGMAwDbdsGS3C5CLLY8mzW2iA8YonFexDN75wL3kGe5+R5HjyUw+EwhxbumcsfxzGr1YqiKLDWXlxXkecFdV2HuWmaJiihruvC5pqmKWxSUSpt25IkSbCE8ty73Y62bcO7xnFMkiRh0/t71BRFQZ7nHA6HZ9ZsHMcgZFVVh1AMHFmWU1U1p1NJFEVorWfB7GmahizLuLm5YRiG8P6itJ1zjA7GyTENA00/oJTmeDzTjyMozeQUaVagjMVqGPoOlFd2iglrY7I8p+9bnANjLEWRoLWmbdvgaV3KyzRNJEnCYrEICjXP86D0jTFhQxtjguWXtRflIfNprd9aURSxWCxI05S+76nrOhgqWSdZW6VU+L7sCWNMMChikB4eHnj58iWvXr3i9evXVFVFXdc451iv18EDHIaBJEnCHtNaB5n4f10pOOcGpdS/BfzXgAH+I+fc//Fd35+mkdPpFIRN3P6+74PFB55pyjiOg5DL5hJXTzZY3/dB0MUD0VqHxTMmoqoaHxvOGnm73ZKmabi23Ge/34dri4IoiiLcw8eZUbiOWMW2bWnbljzPw6I4B1Fk2Gw24d5XV1fBGwCCO3g++9Bns9kAzErFW7vT6RQ2NhAUQpZlQXGJAhElF0URWZYFoVutVkH5ikDJJjFG8/j4yO3tLeBd6KIoQngTRdHsmk5EURwsatM0wULJXDVNM2/OgqurKxaLBQ8PD0EZSBg0TRNKGxyKfhzp+wGnFHXTgdY0TU0/DOS54XB4YL1a0bUdcRLTdS2RVcSRYRgnxnHCOfzzxH7+ZcOdTqfgHSwWi6Dkrq+vg5yI4XHOUdf1s41UFAXL5TIoV8GpxOubpimEWCLTIt/isS0Wi+BNiQFSSoUQROa6770ydc5RFD6kk/uIchvHkePxiHOOzWZDnufh70RGF4sFdV2Hv/mu8X15Cjjnfh/4/T/Nd2WyBAART0A2lSykuGt93wcFsF6vvYWYJ1Z+Lxr7fD4zTVPYlM45siyb7+nIMm8JJZYcxzFoXvEy0jTl5cuXgMcPJMwR70RiO9mAErOJoIsWl3eQ+DBN0znWnoLSkvff7/fzezaACpZJLK+1lq7rWK1WIQYVD0WUkwi3uOryHPKzKA8gYCoinKCw1gR8QoDBzWYTBFDmNUlypslxPlcByDImIo7NLNw5ceyvI96Nd/u9lxBFEdfX1zw+PvL4+Mg4OLJsQdNUpHkBShH3jtPpjEMxOTgcTxijub27w00T2iiur7ZU52PATazVDOOEm70uUdzynufzOWwQ8Y5ub28DvlGWJUCYx0uDoJRiu92SJIl/5tnbEhmRELSqquB5dF1HmqZkWRZCVSC4+yL/4vGJ0hUlIwpkHEceHh44n8/BOEmIKGFNkiRsNhvquuZ0OgVFKPN9CRp/c3xvSuEfZUzfiEe11mRZFgQMCIsiIIvWmtVqRZZlHI/H4L4Bz7wNsYjizmutKctyttoR00TYKDLhl2GGKIaiKABvdcSyiEsmsblsNrGUAkhd4hFieRaLJyskLr3gGTIPAF3nsxiXSkTmQ4YIY9/3nM/noJhEocm7yVyI0GjtPQHwlk/eyc9BT9sOwWLKM4r1F+vpvTQTnkvuK0IsClw2hay
hrMHl3IvndDpWDKMjijOU8la/7weO5xKlNE3TYqzFaIWalUTX9TgHbd9jJ4VzI3meoYzBTR6rkNCzKAqSJOF4PD5TCoIBiXWWtZFNLv+epom+7zkej3Rd90zRTtMUgO6Hh4dgmUXZn8/nME/fFhovl8tnSl4UhFznmyGA4DuiQOSZBecSEL6qqgBSernqvnM/vhdKAQhovmwyiW0FW7gE9gQjEI1+dXUVwMRLIEjiRHGhlssl4zgGzZllBUqZILQCel3GzLKYu92OcRxZLpdUVRVc5UtFpLUOIJB4D5eAFBA2Z9u2YaML9iEosSyst6wtxtgAwolQZFn2TDldAkiiMOR3l8r1m0It4ZRsdJmDpqmJoieA0DkX1kNibZ+5aLGmR2lFkRdB8fZ9T9f2DP1I28wCqPzmEeV06VLf39+H+WhbRxQl9MPE6GAYJ7p+pO9HrNWgDaOD6lzRVGcWy4Km7bh99471asE09LPhMGitGOf5EY9HNpQoOpnv1WoVFIJsNsETLvEpMWBiaCREWC6Xz5S8yGKapiRJwnq9pq5r9vt98BgOh0PwzpbLZVAokg4tiiLMy/F4fCaTElKKMpOwUry7w+HAZrMJsh3HccCgJGz7tvFeKAWZRHGBgLAAMgEiTNPFAkv8K0JYVVVwhyV1Ji6UoL/W2jCZUZSgtQ2bRCZX0P2bm5uAAF+md6y1AZSUTSqLKxZShE2UnWhzEczL0EgsjbyvuI7gPQVro7C5xcIIki1WQCyKbHJ5JwGcRNgF7S7LkqqquLq6IkkS2rYNlt8DaB1xnAVFKkpW3ivPc7quo2naZ/FpVVVEUcRnn33G69evQ1gUxzHL1ZKuawO2IWFUFEUcDgdOp5MPaaKUOM04nkusMjRNy+lcUlUNJhoZxpGmbjkedqSJpR86PKdizThOniehFMM0Ehs7b+QnsE0saRRFz+SqKIqgQC/xhMt1XCwWQd4ugVoJTy9TxpfKVrxOwX2AAByKghSlIlb+0lit1+ug0AR/2263GGPCusk6iEyK5yiylec5WZYFg/dd471QCm6OTYEQy8mGEYtrrQ2aF55SixJuXGYFLl05D3CZZzl6SSPVdc1i4UMQuTcQlNP19TXH45HHx8cASEmuXjaKuOIiGLJhBfSR55mmkbZ9CjfG0dL3fXjX1WrFOI4hg1GWZXB1x9G77OJalmUZwgtJfWqtubq6wjkX4nVRLMAzjyJNUxaLBWVZhjkRDOJ4PM4Iuw7KVNxrUbzH45GXL1+yXC69YiIOm6Tve9brdVBacRyx2+2DMhS+hZ+DIcTc8m7jOIJK59x9C6rn7ds7jqcT57ICrajKmmEc6dqGrnUMQ8/kJoa+R2uH0YosibynYAwmMmjl51TkSTa7YDXiFcg6X252MRSXYaV4T6JgJEshnsM0TcHyH49H6rrmzZs3wbDI31x6GsKHkWcB76WJhyByKV6OyFqSJAHjUkqxWq0C2C3YmyiCSw/mu8Z7oRS0NuElgJDrFYsnGAMQNPE0TSF+FssvExVHEZExnM5nMknHtS191xHFMVZr+rbldD5Tl564xDSSpRl11WOtAjewe7zziG+W4CaNGyeWhU9ZNXWDsZphHBi6nsRGWKVxdJxP55C2mqaeqqyYxok4iVkUC4Q5KDiIWA5RVPI75xzWONyMWDP12CgmSyyLfI3SiqEfiAzYKA6urWQXZC6t9Qooz3PiyDL2DUPvSCKLURqmnjQ2MGnaqadrG9IkBZMxTVB2NZGNsCoiiTL65sT5UPrNrw3aTlhAaa+klB7phxrUQNuWFEXCMA5MfePfxSoiC1lmGcYJpQc2Vwv6wVFVLV0Lp1PD/nDCOXjz9g390FPVFWVVMjrPQG3agckpNI727cDjfk/X93z86iMWRYo1CUmcoI0mXWRMZlb4yxxnJsrziW6oWBQ51kazt+S9K1G24kWkaUqe5wETULNS66eRSUHdtZhxINMZSe6t8aQgjb0sn06nQFoSzEA8w8ViETy
1uq4DUC1hp4S8wsURYFrSm4JZtG0bMhZt12HjGJskwTPrnaM+nWjmbN53jfdEKaiAhstCBCILPg4XpFVSeKKtgTCh4s51bUtZlmzWa7bbLUVR8MUXXwT3EPxmieaFzfP1U9wVRd56dW1we62NMPrJuiggjr0VFovv4/kOpSGKvIsoaHVVVVR9Rde29LOyE48hy7IgBLe3t8GrkXDHaI2a33GMInDe60jTOceuuzkmfQqzxOO5TAmKEE7TRN/5bE3ZnUmShKurK87nM+PoMwVRZzmfSrp2fApXMm/hszRDKz27ojnTNKKsnw9xn9u2IU0TtFagvLLI4pRIGbquJc1ilquC0U10fYcylslN9MNI0zb0bcRuv+dwOPLmzRu6vmEYR+rGs/XGoWcYemyU0Q8j4zRRNzVdV6OmCQVE1hBZPw/KKvLNgjiJ/UbKMvq+pe8i3ORp02nqler51IbQQt6nrp/4GiEtOY3oQVLMTx6hm2UrmUHh0+kULL1kx8S9b5omGMBLQplgLpfPcAk2ZlkWQhzJnAkeJEqkrCrKh4fAq1AzzqSNIZ7l77vGe6EUpskF4EqQbnHjtNY8PDw8Q4olLhMPYrFYhNis73sUcHV1FRhrct0XL14Ea3xJZ75kLkrsL26hTF7fNzP/309sFGmU1igsbavo+g5j/HeLogjutgiDkFBkk4q7LAoijmO2222gvEpuWZSYCJ6kliSPDd41bLsRhw44g4QFglFIPKqVo5ytndz/0pVer9ez0u2Joyyg9peuv4QJHj+JcfopE3PJ9hP8RbgRbphoWs8zcMz1B1pjTUTTdDTtyPlUsd93PD4+UlUlb9++JS/8BuvalvVmwzD4MCbNUtpuwE0T1mjc5MPJ29u3xJElT1OKYoGyiq7taNue87nky19/ibWazXpNXVXewPQ94zhR10OYM1Gm1trglQaindGBSyJZFfEIhJHrnCOy0bMUtYRuT9hNGxSKhGpiFAUYFH7CZcg6DEO4h6Q3F4tFABGTJAHzlBWSawIh1P6u8V4ohWEYuL29xVrL8XgMpKC3b9+GeF0EW8IKUQJiUYuiCOQM5tjZGMN2uw3km7quQ25+GAY2m00Ad47HI5vNJngfl6k/Y/ScKvTMvSe2mqXrelCeaef/RoV7y7UERMqyjPP5TFmWRBdEH1kwYSVK7hmgnkHEoiiekZSAYLX6vsdhAigqqS7ZlJc1CZdsyaurK6Io4nw+h+8K5nJ1dUWaFDw+PgagVGJfmZuQ2o004+TrJYyxxHFC348hNWstHhcBUJp+GGi7niT1QG/Xjzigqhoed0du3+6oqoY3b17PnpSPu682nuBV1SWME5GxpAufLZlG7z12TcN+fwDnWXNpkrLerhhmTCWOE47HE9vNGqV8UdfQt4zjAPifxfAI+3G5XIYMgXyeRDF56nGqrmkxStE7GPseDSjniIwFOwZGq8z/OI7ek91snhHuJFUqSkYUziWv5ZKkJ0pCMkMicwKCX2+3gS5t59BaFF3/x4QQ74VSgCeqs4BZMjkCRAkFWHLfQlCSTIBY+CiKSGdrdj6fgyJ4fHykbdtAQhKrKKMsy0BWEc/i008/RWvN27dvGIbOp7fGHufG2XrUF2yxKwAfE88ehqT3BBmOoojT6RRCo2EYQoGOKAeh2YrWn2ayjzA0BUi6zKFnWQbKUlZ1yMVfIugCWOZ5TpbGRFYHVPqyNuF4PNK2Lev1OmRbBOUWOrlwOQQYU1pRxAXGKMZxYhy7kOmQ9Twejx7onb04ZUCfz9g4JssLTucWo2PK6sjt7QP7/SlQojebDW1bM3Q9280GrRXHfce6WOCUIkpTtPL1Id3kQsVjWZb88suv0CbiN376Y1599pLBzrT1KEbh2O0Ovmiq7XDOez1iReUdRcbEc7TWst1uqZuGZvYucY62aXGxI008L6TvvJJiVrZCOxdFLt6opCWdc1xfXwd5FAUgKWwBhb/JstT6qUDrEoObpolpHHFzODWNI+0
Muv9xFGd4T5SCCLekVwQYk7FcLomiiLquKcsyxFeSlpSJAu+6d20bGGuST5ZUobi+l4w2IRzJvfq+53Q68fbtW1ar1Uy1rgPNN4qiUNAirn+e57NrXwVQr21b9vsn5H2xWIRCK6cUh8MheEaXRUYy0jSlWCwoZqVSlmWwKKJEQtbFxD6fP6fDBLyUasTAaXC+UlDqGc7nc4hR5f/+GRQ4EwRNvBX5d1mWAb/out6XIGtF1/U+xNIaa2P6vmMYJk6nM3E8M+lGR9P2PO4OoA1ZvmAYFH0/cf+wZ5pgsVzyuHvwsqENRZajnGNoBxIbkSUpURwzTCNd37PIM4wCN83emjVUZc0Xv/wVTsPVyy3G2DlDNNC1LVV5whofvogMdd30jAkrALhYXMGt7PzuVVmyXCyelagLBtRfhKhCYBOFKRkdr/TaEGJK1argA5ebXHgo4glIxkOMqHiQPnzs6fsONw1M44DREctFQVmVRFaTptl37sf3Ril89NFHlGUZSqe/LfYSt0pcYSHsSFpHqt3i2ZuQ2FuYXSLMl6XZdV1TVZVH5udFkDSUEKfO5xNNU8+bvwjXjKKYOI6COyhZlOPxyP39fXD3tfaW7JLnoI0vt5baCEmZijUKYcWcTRDaqoQHomhCyDD5KsJLctIlaCsYgNYWowhkL1EsElKIgC6KBXnu3Vnh0EvV6WU1YNd1HoGfJsbRhazHYrEM+INSekbVE/qxm/kgI3QDZXUmjnuMzfijP/o1Xe/I0oR+6Fmt1r6oLFsTaYPGM0pXxYLNek3Xd7S9I84ylNYMfU+R+dT2qSxxDuq64Ve/+jXFKuPF9Q1FkZPEXglExqA19F1HEkdzaDiFGP1SEQoHQGRmmLMBsmlFviTe72euwSXwe8lfGYaB+/v7IMOSDpaQU2p+BDwWr3e5XIb1yrIspIXFaAo2BoosS1gul5RlOV+7J57xHSnj/7bxnigFFejKAgheVgPKRhItKyQX2VQS70lKcoBnfyOxFBCUziUB6nA4hPhMricx+TA3tUiSbI75YRhGrE1m5ZWjUDRNT9t6AE9AoUsOwSUTs2ka9OySyqa9pDmLFZmmiW4OGYSHIKlY8aaEyOIA58bwHt+ck4eHhxngLFBuDNeXyj8p+JEwzIdATSjgAcJmkZBFuCXHc8luv6NtfJwaJzF5VqC1Yr87UJYVWZqxXK1ou4Z+6Gm7jvJ4xJiEmxcf8Q9//kuStGCaHrl+8ZLj4ZHVekXfdmzWa/I05f7tG9I4YZHlpHHCerXgXHu3+lzWKAer5RIHtP1A1/UM40hZ1rx+/ZYi8+uyWa9IEzxIOo2M40RZ1rNRIIRJApqu12vSNA0g4jAMvLu9w2qNc9DblgZF17WMg5+fxEYweqWyvboKIe6lVyqZC5FVAQIvy+llvsWT+SavQTxCqcgVXKprG6ax57B7IMsyrjZrjscDE440tmRp/J378b1QCtOsfcHXsS8WixBniSadpokXL16QJAmn04lPP/0U5xxfffVVcJHF0vftE2tO6tovyRuXfIDNZhMq9oTqKtZXrOc4TsRRyjhAO3nUP1/nlGVJU/fBXTcmJkl8mvGyIAkIKHzo9zCnp4RNKMQTIdYI3TlJEnBP3H0RShHYpwYeNpBhgIBSSwwp3/Nx/RQqTzebDXEch3BMuPKPj7vgRYkChSewVhRPMdfwy30k/XZZlOYm/x5xmmAiizaK4+lEuz/SDwP7/Ylf/OEXZPmGNC2Ioph+6Nksl0TLFatFgRtHPn71MX3XkkYR282WbmxJxxjiGDd5L0WbiAnYrrfsTyemqsZY38jkcDiyXq+IosT3J1IaayOsjairCmOmADReepaPj4/PQF5jDGmSMI1TwLFwjnF4qsIVoHAcpyBP0i9DUouXGTSZXwlZFosF19fXz0hqsmYy1xLOSPGc3NPjV4a2LhnH4ZlHKAo9et+zD8453r59G9I/oqUFCRcBvyRniMYV5p7WOpBO1ut1oIrK9WRI3vm
yLkAyGcId/yalN0kShn5imkAp3xcgjkeyLA+hQ9eV87VUsPbyvAIYiSu+2WxQM/gnRTJSBi2uudz7dDqhleJqtjYSy18qDa01KI2ZFYN4GhIXi3sr2YUi9wDoN+vwhQa7WCxYLla8fn0bnl3CBaGPC/BYVxVp5tmfTwzMMTz7OI4XhWEdw+i7EiltsMaQL9b8nz/7Geey5nd+53cxOgHd8fLlS7SDNI5ZLBYs0pS2rhi6nmlOISZJhCOdLawiihNG570EbSKiJKVpW6q6pCrPfPHFF/z4xz/mpz/5DYzWpElMnvneBicbkSQx0UwCE0p4URS8ffuW0+nE1ZUHkyW1eAnSitcpcisclKr1PTQE+7mkP4sREzxNiGfSD8G554VclzjDpQyLwWtnfg54nkYceYUjoYisi/zdd433QilY4+sRjscjfd/z7t27kGYU90hAFfmOpCulUlIQf1mYy1TNdrsNuXsZogjE8orLLJMdRVEoS66qGmMjpmnAGsvEyPHsF7ooCrLCcyymccLaBKMtSexjUGsNfd+htaFtK5zrAeMBMRdTns8M/UCRp6wWhU9tDr4yMoojmDGO7XbL8XgMHAapMRDLMIw9bhxQ2ruim+2KceqJZnAvjmPqqkYrAroOTz0DpclMqEvAkRUF1po5y2DoxwETWdL8KSYVJVre+c4/i2URrKNjomkbbm5ufNuwumFZLLFacz6cWGcL+rYnMREvrl9wPJ1xxqINLLOUoWl4eX3NDz9+xeuvvqRvGnAjJlakWUSW50zHCa19eJOnKX0/0LYdLS0qtrDMedjD211DW3VU54ZxND50UIaqaWjrM7F1aNexWl2R5xn39w+kaQSMZFlMHBus1bRtg3OeEyDzJ/iWKF1REJvNhmoGDq21rBZLwPHu3T2RjWByJFHM6XRkgqAwJIwVeQZCZbBwT8QwinEUT1AM6jhNNO3EhMaamGLhM1/9ONB0A+3j4bv345/Xxv6zDGM0n3zyCev1muPxGBqaCFBWFEUA6g6HQyghFvdYyB9iwaY53jLGhHBAylxlI0s14/l8DmQP0eSCXex2u4AKM82dlyLDOA2zq6pAOWxksINhUL5Rp5rd0rbzrp1S/h2H0a5uZAgAACAASURBVG8UbeyMdDd0XUtdNTNu4YVaa0MSp1RlNZOiTECcV6tVUGjC2S/L0mMU1mCsJp7bqEWxpW2bOS2pub65Ik0ShrYL4GogFs0kGAFrp8mBAmMtahhQWs3FRsZ3VZ2BrnGaaGdST1VVF/UEal6PBHD0fcc4Qdv2tNPANPpipZ//4R9SNgOr5Q1oTVbkRJHjdNjxkx/+iI9fXvPrX/yCsWtJ4pi2LbE2IslihnFgvd54l7yq6duWMRrI45jjNFLOhCM9DCzSguP5xPlcsd8fWK2WaANtfcCqiSw2VF1Lki+IY4u1vimt1orr66uA46TpinGcGNqn7lkSLklfD+E1iPf3zdL5yFqSOeVrtKZtWqqm9tmUeV2ltkIpFWpqJFQQb0K8OjEUAnIKjwdAKUOc+PoLlCbPFzinOM0Etm8b74VSEEKQuFJFUVAUBb/5m7/J8Xjk7du3Ie8ucbpMnFCRBQk/HA708yKIGwaEybvUtKIIhMwkE3uZ/YC5R8CczhTyStM0z/r7FUXBcrGkrVt2ux3n8sQw9Dg3EccRUWRJo5RxfCpD1lpzc33NMTpzPpdIlBPPDTratsUxhfcSF1LSmhJeaa3R1vKwe3yiwqYpH796xVdffx1AxM3MVjzMBVPidoogCa+i73varifLl8/CL+HpS/9CqcAUZSCUXkHlLzv/OOeoBkeeZ7hhoC4rXJrx8PjI7bs9/9hfekESxzzcvqNrT3z+g0/JspT//X/939gUGSqyNGWDjSKKRTFjL4qb6xfUdU0cRdRlxdD3TKPP0Q8zrhNFjkXkN9w0Dnz99Vc0zTXX1xu0sRRZjGGkH7whuCRpCZVYeAKBfj/qIEcif2KcZH6kL4a0oJMemEA
oPAvl0oCem9qItyshhBi7SyaukKpkXYTkJyQ+8bYFJJZ0qHgyItvfNt4LpeDcU2NJiamttXz11Vd0XcfNzU1I9YkQi6CJovDcfV9lmCaJ78YzI++XsbdoU4mn5VpCRZVYUGrf+74nSX1sKlpasglCIpFqR+ccifXu3zD2WGvmEuPWK6u+A/WkEPwiLlBosizldDozDN4dFhpzkkR0fRfuIZ9L3z9J107OEVuPRfStL8rSWgcXtes6Xn/92j/j/K5AoO5KeCXvVJUlddOH1KXMTZqmITU6DEOYD0HXBVATBSrKa5omojhBOUeeF2RJyu3bW97dP+KU8V7i+cQnrz7iiy92/LV/6p/g9//e3+PHn33G490tQ+tDh812zWa7xc6bSzgA2fxcCl8v8jgDtOM40nQ9UaS5udqirOX2zVvquibPU662C5wb6YeBoR+py0PwCqQobblcstlsaJomNE5Jowxro5AalEyTbDaplxD5u6Q5eyD3MeBRWZZh44jJPTWKFY9WmKSCt0n68XQ6hUyDAJCX2IPQ3cWDk1IAqc78/4FScIHaK5tMOAZSHi3luDKpwmkQwZSX7vueaRiIZzBHYrNLltol2CMMNcElpmlivV4/IzdlWcZipkPL80j6RykVYjrgol/BicfdI4fDnjzPqKqSpq2x9qkRp3OOw/5InhfkuXQ+cvQztjFNE23XYO1T41lBqaVNunRpbtrWA3d5Htz5JI559eoVVVkSzy5llqa0F9x5UbJ5ngdFmaYp1zeWyXnlJaW2onyFRCbzKEVd2+02hCEyj8LB6PueEYjimDhOqM4VX379hn6cePXxpygFWZry5uuv+Mt/8S/wxT/8OR/dXLN/fIBposhzuq6m7zvO55KJCWsiSl3NGYCYZVFgtMFNvozaSirv7VuOVUU/jiRZxuF4pGlq1qsFi+JzEqtYLldMbqLuvFsdzwBnVVU8PDw8a8Xmra4O3hA8pTElpBCleNkNXDa8rKOkIdfrtS/qapvwXcHOBFi8TK9vt9vANhUsTTJsopCB0PNTDKF4GuLRfNd4P5QCBLdNhGy73bLb7cKkX2pFIeIAAeEWF26xWDAOAw/39wGoFJbeZessIDDLpEpN2o8LliDNLeI4ZpyRYVl0ASGl7wH4zMb+sCeOYtIsYzks0Vrx6tXLuR33kf1h9yz9p6O5S1HXMgwjbdvTm4Gq8gsZxZYoss/iTFF0cl9RoEopxjgmmj+Xis+6qkjimPVyyc3NDafTibu7u6AE5Xkui8PsnBYTZFxCAcloXHIc4KkvoXgzm80mrFnoZzGOFIsFp9OZx92B23cP5IsVcZKRJClJ4kle29WCX//yF+wfHtguVxBFdG1N2zSM48DpfGZyI3GUsCh8SnS5WJDGia950Bq2W6KZy7FZrTz6Po0wjsTWsjse+frrr1ktc642S4o8JY5T0nQI4ZIAz13XhbShGAOr4rDBZC2kFucytAWeVTXKMQRyD2lyM82HuAhT9nJdLolq0zSF4sCHh4egqKTkOs/zEDJfcn1k3URehCvxbeP9UApzbhaeDtSQw1UuhU0stUz+ZWNNwQv6vp+bbeiAzEpO9zIHLI1QRLOKGyoNT2QzrFYrtDGMs+aXOFNaXQnJJEkSutZvsp6efujmzePQ2vDixQuKRc44DYG04htuGpq6nd1PsLbDGAtu7pgcW7R+3r2nruvQqESwGBGyvusx1p961NQe+FoUBeMwsHvc8eWXXwbSkQifuLGXVNk0SUNRVnLhdUmsfJndAUL4IesitGo5i2C9XlOejxyOJ06nMz//h7/g9eu3fPTJZ7y+veOv/dN/nYf7e37yo8/ZP94ztC1XmzXV6UR1PFHkOcvVkn4+kcpEEeOMqfi43oeHisuTs3wG68XNDV3XUAwT6XJJP4ygFGM/UlcNVRzx9ddv0AqKZfEMqLu6unpWaJbnucdMep41KxHPDwjcAwntpLtTwH9mfsww+PNKjDEkWcrheAxhiMjkZRp
ROnULhiMH8UgXrEvjJPe6DHMvM3CisL5tvBdKAVRwNaVi8bI1mGAAkn8XxSANJyRXHPr4a008x11AEF6xhk8NR13gKch5C7JRhLhyd3eHA1azxyAhRVVVzyouBVzKYh+GPO4eUAoWi4Kq8qHO5MbgDoLEgJYoimfhG4giPwdaeXdRG0WWpcHbEcBRlIR4Kr7gpaU6l8+s13q9piqrUDdRVmUAxwTHkdp+qeMYx5Fo5iTIvEpc/cTydLx+/TqEdlJeLJyGaXqqwxAXOjKOIs948/Ydh8OROM/phomf/PSnKK04ziDx1Spmu1lweHikPB6ItJlLoydWyyWrqy1lXZFEccAx6rpGOcdquWLOUGKMP6shsgbcwP5w4lg3WKN9Lw03sd/tyfMUlURoa8Laixd5SSzSWocmK1lcBGN1SYiTWgmhrUsxlSgMmQv57M2bN3MKecvxfArhKhCqhS97Sgq3Bbz3fH19HfaFZI/E+Ikiu+S9SMj93vMUhsFzExaLRYiD2rYN/QFFqwmIYq3ldDqFsmCJ80UgrTGM84KkaUpVVaFtmnMuEICEtSYpPlEe9/f3obHm+XwmimMeHx+D+yyVigJ8CpFnGAbKo7e4282Wc+m77RwOR/yZiyNt98Su9GQUT2h6/fo16/V2VjJPrb/bqkFrFdKokh0R4PFwOPDJJ5+wWCyZxqcW8uL1yLNKvDnOZ2yIJYHnZyiKwi3LkrJqgzKVU6qko9R2u+Xm5ibgGjKPArZJ8ZkoEQCsQ2nLuSypm4a6bllvPAPx9evXNE3Di+srjvs77u++ZvfwSBYn5FlBlqZoK3005mMAshSj/WY4ugNMDqMNyax45Zms0awXBeMw8bA/kCYJVzc3PB72vHnzlixL+OEPPiVOIsCFNLUoAfm3EMCyLCNP8gCwCr4ia3ZJL5cNLCDfZU9RIDB3f/3VV+RFHkIUMXiirEVZXCpz+b2sm5w+JqG0zIGEyHJ/IUp91/gzKQWl1C+BEzACg3PuryilroD/DPgR8EvgX3LO7f7Y60ip5/ySl+lCYXlJquab2Qdx8yWmG4aBPE3Zz30IHh8fQ9oHnhBaidMEYxCFIhx3wTdkgzXzEW9SAy/AojxvVVUM/cBqsfJ1DlZD6bES58b52YcQY4t18acaSSeeGqX8eyRxNm9GbyUkuyJpJ1FCsvFxPstQa8M4jeRpRhontHXDXbDUflN1Qx/mGJ6ou0CwcsZYhpFnyPnd3R3As7ZkWvs28fJOApxZa8NZjcKoa+mZnA/bFssl0bEmzQs+/vgTxq6jWBTsdo+U+zse7++xxs6Zk5Zh6FlvrryljCJU21E3LcWcHUlm13qaxgCsBW8Qh9VemS2XC45Vg1aavvPzutvv+eEPPps5JuqZhyBovQDP0nGqa54YsxIiSer4EtQWxSuMQvnMWht6KSqlwpoIN0fwM1Gsssmrqgrhg4TN0udTPDPZN+JZSBgDhErdy2rcb44/D0/hn3XO3V/8/HvAf+ec+9tKqd+bf/53/rgLJHHMp5988pRNmGvJ4zim7zrMPIl1XYVNZY2maxvqqgzutDEGo5XvwTiXL8thGGLNxbXyTEnHND3lbqdpnN3GmKoqAUeaJnR9RzETSJI45mq7Zb/fh5ZuUiZb1zVZWtANI6fdbi4bHlAaHncnJjeRpl5rK6YZHc+JbDSHKvdBMD799DOurrZo7cgLn2F4eHhgHAeM0eR5htaKKLKAo+0aoiTCKU86SrIk5NclJx5FEcZaaKBrG86noz8ncowuwhqHVnEI3ZTyh550XRcO8BUMImRzGINSrduK4wEiGzH0PXmWE+W5J89oAMdXv/6SOF1RpAVXqw1j19M1NV1T0tYnhq7H5kvGvufY+HRjaiJu7+/Zn04+n59lLIsl2IxlumSzXBNZS1WWKOf7WsbaEMeWaepJigybJTR9Q91UnPf3JFaRJgl1M3D7cCJJY65XCdc3V3Njl5a6rnAu9sxQDEW+ROuIpj1RzR5nmiU
kaUySRLStZBtqqrrC2pgkTgL5SLAzUaJinFarNZN7OudUPAj5jmTbxsHzL+JkPry360IadpEX86niUJUlk3PPWhxKilVSld81vo/w4W8A/8z87/8Y+O/5E5TCOIwc5xbf4vL3fY9WyjO/uo5xGKirMhTo+L/rw2b25aqOPEvDz8fjEeCZZRYvwbe6Hqiqcqb2+uvI8fD+qPGJuq58w9coJrKWoe9p6tqXNKep7ybkfHOPJElohwHqmlM5FxJF3tvIiqdDUrQxpNFcOBPJmZg5XTcEywyO0/mI0mr2IBR939F1Lff37wIbcb32dOOu80enaasZuoH9cU8/ehd1sVoE65XECZE1TFMWSqHbtmX3+BAKtJq6oh8nnPZH9HW9rzY087wJyOlJXDVDW9P3HuNZLXO0nsVqmhj6js3mimkYgYZ/8LOf87jbUSwjfvL5b2C1patrqvLI+fjAx6+u2O0GTJR45l8UE0WWsevYnU5UTUuaJKhJoTrF1eIGNWkebv3zx5FnIzLN4Gff0/YNIxGgSGNDFimG3ocWcRRxd//A6uYTlhgy2xEfK/IiJY5nVmjXkMQZRbHAmJjTsSLPE+LYBGss87jsiqdqUlswjSqEXOK5yDxLJe0wDKS5L/+W0FnK4y894SzL6FrfXUoo7qHWIYqIZ6/Dh2wGHT2dqyF9OKTHyB/XaOXPqhQc8N8opRzwHzh/kvRHzrk3fq+4N0qpl9/2h+riKPqbqw273S6g6hIHNU3DarUKWnO9Xj6bMHHFLnEBT99dhDhamI2ipSWW9u738y444nIK/iDx2jS54KZfX19TFEVIX3700UdkWcbPfvYzyrJCRwlJkgYN3fdP7cujOUV4PB5mprAKjEhpjSbsQpBzAZ6OOQ/KcMZQhANwWQq+Xq999ebsVobWamk69xlsydIsgLKCoEtZdiiYGSccPv11Pp+5uroKc+7Pe2g8e9A5lNIhZs2ynPV6w35/mMOsnjT1HtvkGn71qy/ph54otozTQFt2fPbpx+x3d1hjePvmLadzCcw9FCcoipyr7RUfvfqUcfC9FKMkYbXeUnUNQzWwWS15+ckrkijidDxSjWeUtURGM2mHtZq+70iSlJubF3RvbzlWDUkcE3cj+90jL1/cYIzjcDhQNyVX1+sAMspanseSum7AqSBfl99ZLpfPNl/dd9T1OXxXMgCHwyHInmRQ5LTv+/t79vt9CAfkO3meo5XCzfIv3ZnEi7gsv4/jmCTPQgczkd8kSQJl/rvGn1Up/HXn3Ot54/+3Sqmf/Wn/0F0cRf/TH//QycaU7IJ0bxavQeIviatEqIVdJ/GX/FtASGEyCmtRznwQd87TkONnlWPW2mfYgS9MycJJv5eHfV6m89brNTZOqeuGafKbxxrFNPbgNCaJyLIExZLq/EROkXy+HPB6f38f8stK+U7Jl8fAi7BI3ChIswja5bxdNvmA+SBXFKfTKbAi3717B/gsg5QNa2MZnO8OdT6dWK+WbNaeVuuzHnVQQtNQ8/Dgj5+LoucZHH/8nO+2/frtV9w/7Oj7kU8//YSf/cEX5NmCP/iDn8HU8vDuDbH1Z0IeDkcME1lReGWrDE5pUJaXn7wCYJwmdqcTKFhtN9RDD8aQrZaYNEbNstKNIwxeeeVZgXOKKHrkarMln6AdHyjShP3jA+uPtyHtezppsizh+vqavhup645p7CiKBXW9Dwbq8fExcFcEC5NUosT4EhpfnosqYLP3dt2zngvieYiHIPJ6Kcey9pf8GfE+bOSzQbKPhGsitTPfW/jgnHs9//9OKfV3gb8K3CqlPp69hI+Buz/pOpIiEwspZBwgpNyGYWC/3wdXKE1Tbm5uWK/X3N/fh7TgZa58tVoFNxeewogkSSiKgsNhz+GwD+kzsZSSx3/qYfgEwklKaTs3xZTFEAQ+z3PsbBEOxwO+IYsNKPzQ+yPJbm5eMAx9uIcc4bZYLJ49d9+3ARSSdwtFWhAIWeDz0wKKyTsK8nzZq6E
8+bTVfr/n6uoqPLtgMzCfXRklc2r2zH6/49WrV0ExT+PIpGZMwyShXsXaiLr2523iFM4R8uhdP9F2A+vNds6c7Nis11T1maGrWS0XvLje4saBx67hsHukqRs22xtMFJOkBdpErK9ecC4r6vJIsVn7TMc0cihLlPFZgvXyGtyIOxh6NzA2HmCOE8/+LLKM/fEdzeiItOLd3S11VbGIJ9IsJs8TFARqs0IzjjrwR+q6DpmJPM/DHIpMyHx6JWsDLiBertCMBUSuW9/GXsKL0KcBgmHzhXk+VJWQRMhL0hNDfpYMloQJUk8hHsulofjm+H+sFJRSBaCdc6f53/8C8O8C/yXwrwF/e/7/f/EnXctNT/0LJe8shBzhbUs65ZJjLkIsn0layJfuesKQMYbj8RhoozIpsnDS3gp4xlsQdN27h0/Hjo3jGFrOf/7556E4SUBNay116Ss6rVYs1yuM0UEY7u7uAivucDiEqk8B9YRGLEfO53ka2INd1z3r7ychwmazCUIjnoEIisyh9KWIo4h+Ruol1bvdblkul9ze3oaejUYrtHKMfcsiz0hjS9dUWFMQGf+7pi6pziecG/joo48wxvDu3T3WemGW9nVuPg5+tz9xLlt+97f/En/4R7+i7zseHt9xvd3Qli3bm1eM/cDjwzs22w1FscTYmDhNQRt+46e/yYRmnODzlx+zXuUY47DGotzkW42lCUnsz8YozycWDn8IbVszjQNaQTeHklZrpqYhjSJMFDONvkXadrsmTX0pfVV7o7QoVsRxRmR9Cnl7dRU4GOIFyHxLuHo4HFB4HoqEAcIMffnyJUmScD6f/SG/ShElccBsxDOQDSzyPvQ9fdsFfogQmiQ7J3hB27UhWyZA+m6343A4hL/5rvFn8RQ+Av7uvBEt8J845/4rpdTfB/5zpdS/Afwa+Jt/0oUkJpbCjzRNQ3+Dp82RhwNGpIONcPUvmVziVYj1lRhY3C5hp/nFqYnjKNQPyMYULoB4FVmWU1XeWotyEXKTLKyUey+Xi/Bscl9JbSnlPR9ZTKkVkGcU/oBkTlar1RyGtKFAR7ABINQrXBJugGchkmAB0kVpvV6Tpb5iT7wMQbkv60QcT8w48XyEGpvnWajE83NgA47Rth3GDEyTozxXgC+53u12vLvf4TAkScrhsGeYPGbStTUvb26Io4g3X73lo49f4hxkRcGLlx9jowSUZbFckecLkjRjudoQWQ34/H95PnP3+g192/DyxQ3WapQCbS3jNMHknwOtiKKYRb7gxfU1++MvyRcrdqeSru94bAd/glVsGMYErfm/tdLruh6j+2CcZA1k/sS4VVWFwgaPQWRT1k/CCV/laoguKPnCtL26ugoGQMIQWV/BwKS/xmW3rzTNQKtAoLrkpVzyJL51P/5pNcA3h3Puj4C//C2fPwD/3D/StXg6YUespaS7pMtQlmVcXW1CM1VpiimLIg1CvDV3vHjxguPxGDaLdNERj0Nq4KVASYqxpAbjMmbbbrfEcfIMc1BKcXt7G4AerxQcdekr7IyC2BriOApCc319TWwtQ9eymgFKKamVTIlscGAWkCrU5wslVjwDgNvb22cbVipMpQWbhFHjOHJ9fc16vaGZPRo5ZFb6TYinIR2oA2BV5KFrcNfUrBYFm9WSpirRynsaURQHzoa3VgPT6Gjbhjj2hVf94Jic5vWbW05VSV5kKOM4Hvesi4w3X39NkRUMk+LHP/kp9w8PdMPED3/jh3z+45/ApLi7u+d0PPDu7h2TG6mq0qeltWa5XJAlMV3fo3WC0WpO8U0oB1pZxrGfOS744qlx5P7uFmVjuqahn1rivSXLEpLEkBS+DkdhiOOM8lyjlMZonrFZL5u9Sj0OCJnpyVWXZkGyXlJDMwwD5oL9uJzrVMRYCilpGAaaqg4hy93dHev1OoR1stmNtbQzl0bwCaFei3fzXeO9YDS2bcvr169Dv0RBVaULs3RBMsYTPuI45s2bN8HVkjhcKsLatp8
3chxcflmspmmCZyA4hngY0n1IuhyJFf/666+pqjrE6Je1BqKgBCx0zgVWmngAAqK+fPmScRy5vb0N3Z5lsa+urp4dgPpUtzE80/JSny91D/CEych9xfMSBqZsdoDDYU/bPPE1pDRXyn8lvJJCoMv7aP3UlVpiV/kcngS870fatkMrwXCyeb4MyljevntH33cYU9A2Dam13L55w6vrF8TGki02/PyLX/Nbv/Vb/PZv/Q43Ny/48ssvKU8VQ9fx+eefo6cRbRNeXF+TJP4ch6HvGXrPa3HjyDA4nPOdjcuyZho6sjQiSzPGYqBpalaLBWV9T2StL6Dq2rBpd7uB09lnXzabK/rOoZQhjhIiOyJHBwpBSAht8KTctX46nUvceDFykkU6Ho84RSi6uwSVT6fTjEHdkGUZ+8cdb9+8eZZFk9SyhBpt21I3Ddo+HRsge0HCS6lr+bbxXigF0V7SYWmz2QQOdxzHoUDp7u6WFy9esF6v+frrr5+l5ISY4Tfl00Emcl6fhANSjOKc43AgWDYJU9I05eOPPw4l1pdI+jiOHA6HkN6RgqinqsuOyPhY3hjDOB8z5wlHhi+//DWvXr1ivVqBNgzjGNJD0dxYRcIHYaklSRxCKllMOQpPctyi9QMte34eQbOlV6TWmiL3HtHpdHpWNyKH4XjWnz+gpjyfqErnQ5i2I4ojH6+PI0orrNHAU2+A9Xozl7GvKMuKaXT0vWdt3t3d0XY9yvhQxti5K1Qc+epE46tBjTKcyoa/+I//FX7nt3+bX37xR9zdP7BeLvn41UuU8wfjvvzRD0iyglPVkMwe4/l0ZP/44FmNlWcCerLZFamx3N++oa5rVkXGyxcvaNqG9XrF/W7P1Hekccykx2CQhqHFWL+ptDJMkyayKdZEMyj+VHEr1ly8vBCOokM4K1RywR3E/c/znGF6ophLeldCEkmTa62ZnPc4D4cDX375ZegQJsZTlEU5e4CCV4mcSNghiv7bxnuhFKZxIs8ypnEEYyjPZ6qqnC2VYb/bcXd3x+c/+mGwzMvlirIseXjY8fCwI7IeG3j58mNfYzCnwcRNkzSlpGYk4yBWvOt6fyZAkjLN3YeTuY3VarWm74fQj1BiQSBYV8kVZ8sCiaPHaSSOk1mT+wKZd+/uiSJLsVwRm5gkiWfqcj57KiX39w+M48iLFzfEcRSshYBI0hxG/pNF7zo5E6IPmZw4FhBWczyeaOqGOPYVhkLeMdbQtA1aG2wU+dLk0Z9DEEUR4zyfUTcfZdYP/rp4DMbqCKtjrLYc9wd2DztsFNH3oydBobh9ePBu9ORomwYNGBQGRWQsP/jB53RVw2ZzzW/97j/JT37nt/mf/sf/mZ/+5Df8WZFas1os+fxHPySLY6+kkpQfL9egNW1VkqcR29WCcRz8idIzpqOVosgyxqnnuH9gdI48if1BO7mnKPfDRJLn9KeBrm/o2p4kjbCRpzwPczGWcwrUiDYarZ6OJjgcDt5DmRykDhtplI0YJ3ATc/ctARpbjFbEmd+wRiv6cWR0T2dvSkgsYeD5fPZgYhxTZL7uwlPoXbjuJdDsgKZrgyG5VASCQ3zXeC+UAgrSJGYcevL1ipubG25vb9l1HW4cKeuacfBxr1i/NM04n6sAZt3cLCnLFjjz8HBHnDx5DkIwub6+RjoSezJT/OwQmK7zhUC73Z71ek3TtMjZg1Jr8emnn4Yjwi81+WKx4PHxkdOcv18ulwyTo247jueKpvMWIE7SOW3Yz5kT6dtvSNOVPzg1lVSTCkIRUOUZzJIQSATieDwxTX7zgwfTsmw+j7KueXj4yocYK8tp7l0YRRFt3/kWazPeIvHmOA50Q8/2+prPPvuM+/sHDocj3dDjsCRpAgrO55IiNdjI0pYNRZLQupbj6YhNUs5NQ1m3nPqOsXf0dU1qDG0PLzdX9F3LIl9R1z3r9Q2//Rd+l/Viyf/yP/xf1L1LrGVndt/32+/3Ofs87731Iotkky12s1vqjgJ
oYgTINEBGCZxRBgY8CZCpnVFGBjzKKCMPgiSDJHaQQQIEiARLHci2Iqsj5tLcAgAAIABJREFUWWr3k80qVtV9n/c5+/3O4NvfrltKM23IEsBsgCDrsliX9+y917fWf/0ff8TTszOKKMLzPMZBwHKx4Ob6lo8+/JC6LUgPO7788pdUZUneA87z+XwYb1zPoa4NDNMkSWKwdDpdxXQddMtE1XRcz2MymrCPEtROw9Fs0ihhv4sI5yGmZ6DbLp2mUHUVhqqS5QlKVw1bpDzNqIqSqgNT1+mqmqJuaOqasq4F3diaYOoqtAq1CroKqtpRVwW2qdNVUOZvRWxVVQ2ApWz/Jd4gR1tN04YxWeI5stswDIOWt8Smhy5SklvxVdfXoyj06LckdEiw7yHJQ8y+wolXhHfEpGlGXTeYpk0w8tBUnRcvXpAXKaE2HvAH6cokPxCJ0srTVNO0oQMZj8ccDgeCIGC/378z+0mc4GHrJW/QkydPhtZfblGkqtP3/aHDkO3cw59Nqu3kWCPHCYlPSNQa3jLgpPpTrppE+2gOAJf0EpCtsMyfFC3uW1GOoEqPB5xEFh3HEey6OE64ubmh68TsvN8fiKu4XzsqdHSUVUnT1NiGh6KpLJZLFprGercnrVri7Z7NdsfF5Fkv/YanT58SRydMow/7HY95/vxDxuPxYMO3Xq958uTJQCTb7/eClFXX7Pd7ri7fEEfHIatCEMr8fktjoevGcMqWZYGCgtlzL4q8QO8DcV3XIy1EhoVpm6RFximLhc6gFvfeDwQPoYXesaolk9uYXs6cZ7nodvvfYxgGUZpyPB3Z7LaDv+ZsMUfRVBFUk4l1seN61A+Ys9KlSa7PZbZGU9XEvWBPPhsSq5A4ma7r1L0XqBzNJVFKbqW+9s5L8gfruo7VajU82JJ0JLcDdVPRtDJuXmU2m1IUZX9a12i6sOSWAJDsFOSLKttuyVH4yy+g3P3ats3l5eU7phiKIlahaZoOrLC3vAgBHsq5/6HwShKSJGAnZ8m6LofiNGgO+g5ErCLbYZaUoiTDMLi/vx/IU/JlkHmUqqoN/400C3VddzhhZOCMYWjvqCwlBgIMhUnMn9UAXAr5txDbyP03CLBRRcHQVRRNFexDVQVNo64b8rwgihLG4yl39/c8efyY27trQCDzo0DwU1RNI0kER+Pzzz/H8z0++ugjPv74Y+7v75nNZizPz0XBuLzk8s0bVAWevfdebwdnslgsBKB8OglRWO+OHSe7noWqYFk2nm1hGTp13aKpW3TdYDKZEk4m5GVO03WUd1e0LUSniKouMU2DwPPQNZW2aTAME6/v1oIe6I6OR2Gjr+to/f0OgoCOjiQR44A4jGzSVFjLCV8Ol6ntDmxIyS2Q62xpZxdFEXQiq1LeV4lVyKIuV9tN2zDug4El2CxtAeQB91XX16IoKD3Y9RC0OxwOw8Mq1y/36xvyIus/BCjKAkMXxidXV5eEYciz956x2+0GbUDXCfrobrcbciLkyygNUGV7/pD9KFeXUgrreR7r9ZqLi4uh+3gYsmoYxtCJyLxKSU6Sc5982SWgKU9xYKjmMgVrt9ux2+3e0dbLrkRR3npUSmk5vC2iEpSSs6kkwYjvaw1kKgk+SUKUdLp6uDWxLCE5F4VaAeUtC880LWzL7k1nj1S1UHDe3N2iWw551Qhac90ync4pjtLNaEJVSU8AUZjp4OzsnB/84AdkRcZ0NmU6nbLZbBiNRpxdXPDLX/xCIO5Zhu/7PH/+PrZjDYYmVSNGtHAmNjmdquD4HhMFDjuF26sr1ne3zMIRyz59SY6QRiuKyHQ+paxrblZ3YvWsKVRlTZ4XxFGCqasYmk4QjAlGI0F+0zTyLMP1ROaFqigU/QpXM0w03cTzVDLJNaib3h1KxbZd4dHpeXiqNmzI5P2VLF3ZUQZBgKHpg1O0zJeQjMbdbidwFO1twZDPjAQ4Jdfhq66vR1GAoYWWdFE5/0i7KcuyyMqY6PpE18kYrZz
zs4t+Fab1I4LHdrsFGDzx5LpI2oLd3t4OH5A83ZfL5VBVT6cTYRgOgTPyJZnNZnzrW9/iZz/72YDcL5dLTqcT0+mUxWJBnuccDod3KKcS5JQ3VqDLb910DMMY7L9lKyjtu+XoIr8ObzcFkgUqxp8MVdWGn/eh+YssaqLFVOm6duDYSxbeEGjbd1fA0AnJ+XU0GmNbDqqqYdsOWZZTNxVJKjZFWZnT9ThKuo/YHmPe3Nwxni0xTGcARqMoJk1OuI6Nqok153K+5J//838GCCbq8+fPGY/HTKdT8jznX//FX1BVFU+fPhWn83jM+v4e13MBQVXv2g5FVfA9n7qpByJRGFp0TcdoFLJdrWhbKKuauhaekR0aZb8puTh/TNvB6+tLyrYiS1J836WpWw6HI2HgMzufoWoah9ORum3QVU1I55OUIs9RUIbkaTcY9R2bxzic9KvA7p0IQFXT6TrQtbdJUXLElPoFubZ2XZc8zQaatRTmSQMcSY8uypKrq6thZJRxjJIA97V3XnpofyU/KJmAJDGGMAwZjcS8fzqeqKoWy7LZ7/do/YpoNltwPMTYtjPMydvtdhCNyJwG+cG1bTvgBlJsIouHVK1JjsPxeBwAUEkTlZsNOaNdXV2RJMngYyCFS5PJBOl8JMeI0+kwzIEy3v7hzyxXlNKcU44iFxcX5HnObrcbiockHmma/s7II01C5TpS1/VhqyOp4ZIfIYFUScQRRQ3SVJzKpmlydnaGqmq0TcdiseR0ithut2i6yqnM0QyDOIooipI0y4nTlJvbFV64ZLG44DDbYfS7c0lOK/Icx3I5HA7keYHr+Hz3u99lMpmgaRqvXr3CsiyRPuX7b233LYvHT57QdgKk7TSNspe0j8djQBw0aq88PTcMPMdGaRpoawxVwfcDmgZM0+V+vaVpWi4vL6nbhslkyik9oZkaRZFRlhamodN1ClEUo1fFO2g+bYeh64xCYX/XtS2GZYKqkmQ5umlRVzVtT0unKMnSjLIo8XyNu56EBm89MSUu9dCpXK6rJeYmDyzJkpTdr5qlNJ0YYWVhlc/qQ43Lr7q+FkVBzvayZbVte8hOlIy9oihABcv0GI2EZZmKhqZZRKeYTK+wzJMwLenZdXJNI9cvk8lkEAKJk2/Eq1evBnWiNG1dLpcDT0LeHGmAKV9cRRHJPVdXV8MHLf8uu4WHo4fcUNi2PQhjZOE6HA6DT4R86GXBeDhfSkKRLBJt27LZbPobLB4oKYWW30f6M0h5uSA4OQP2IIJXj4Mabz6f0zQNt7c3w55bWsRVVUXblGRZjuf5LBYLYUDaVdzfaRR9YlfbQd2qZLdrUDROUQqaRZqkZGlM01YDP0IWvzwTLfPH3/gmdVMP45V0zTIMg5ubG5bL5cBj8YIRiirDhAxUm8E3QNWFB0NDh6pptIpKkYtuj6bGMowe6O0oq4amE5qKU3zk9vYaXTfI8wI0ha5TqKqGSTih6+B0irC7twQjWqG/0E0DTdV6QxaxPWo6DcO02PQrWd/32Wy2BKMA1/MwTIvJJGS33VHV1fAMiA4nHO61fPE3640wHtK0ITNVHgLyUGvbFl3TB7BSYlryPZCA+VddX4uiIKW+IFhxco6SBSEMQ7bbLX4wwjBsuk6lrlK6TqMsGppGxLDf323xvIJw4qHp6tCCy9lbnvDSnGK32zEajbi/v39HRyDbbon+ytFB2m87jvNOLLlUasotg8Qx5Ikm20F5QorvFb/TFT0sjHIzIsFRKWmWN18KnKQMVxYS07QGcFG2nvDWH1B4Uow5O1sKSXQ/Kkn6tHywRJsvo87fKvQE7mIMBCpQcFwHx3NwXFeAYIrCuB2DllEUFUmagaJS1y2e53M6HsiLlLrKBQ25bftUp47ZbMZ0NkVVBO9C2obd3d2x2+14+vQpYRgOWx1V10mSlAzRPquagWE7dIYOKDQdoGqoXUdVVCJ4Z74gOR0xdR1N1Vmt1sRJJub8Dm5ubtF1g6YQdOZTFOH
5NnmWE51OWL3oqqpr9Loe8jbEKOdDD2LLLYSmCFzm7PwCug7DNIfC4bhej32VtP39BwZRnnwOpEDKtm2hQ+nfG0k9lyCxJLzJsVCO3bIrkweX7ES+6vpaFAVJJHIcEVwqPRIlwm5ZFufn5xSl0DVoKjRNSpblKAjjjLpuRKpRVmHZGkWZDtUU3oJj8sSVYON8Puf29nZAY4/HI3meD2251A6EYTgoIbfbrWhfHz/m8vKSPM8Jw3Co5hILkdRqRVGGF1jGmpumObhWK4oy+EVMp9NBKdk0zTAHSuMZeAtKSkDRcRyCQESqSyOPOI45Ho/DHC8Bza5ryfNs4HtIN+KHnn3j8ZgwDMmylPE47E8ZweGvqpq6qodfHw4HGrWGtsPsTVsURWOz2bLb79FUHV03UTSdzXZDlmd0XdODxaLl1g2DFuF9oPV2+LK7+dGPfsTjx495/PjxsLMPw5C6afrNU9mvawVGIsVywq9RtNd5mlEkMbQNl19+iampjDxPbGhcF9NyOSU5N7d3qIrK/rjn9fUbbE9Etrdth+97g0gM+jQtTUXvHY/8UcD5mVB5RlFE2t/Duq5o2mY4sSWv5CH/QGooVFUZQHK5Ip9MJsDbsBlQhk3CQ8k88K6cHrAcEXknv5/seoGvf1GQKxXRwro94Oj1RByGtqvcRyTpiTiKBbHIMMnynLaDoihFm1c3rFYbmlaGetYYuo7an3RSsVaWFU1dk6cpjuvStSKFyLEtYTtmW1jWjK4TBKXVas18NsMyLY6n46ASVDVVjAFdh+XYb2WuqgpdRxTHuI7DbDZDUVW22w1JkmJbJl3bDUChoih0bUccxWR5NrR7+90ey7Z6q7NgKCwSTKz7bmQSTsSpU5YEgc9iPmcUBMRJTFGUA7+iLIsh81ESsB4KxSS2IJWRitLP5qoycB+qsuJ4PFHXQlDWdS2oQqJc1w1hGBL4I3TdZDQyqOuKqetS9eakdSMESpqu06EwnU55+cVLvv+930Y3NDo6Vut7Xr9+zXgc0iE+x9lsytXVFa9eveb84pw4zVgsz0XBAOqmge6t/0DXdjRaQ1c3qI3N6bjvx8YAo2eBFkVJ2yqEoxGbzQbPd3lz/RrHdYQeoevQ6SPdiwx0TYwFskXXDeggS3Pu7u7RVQ3LMvH9AM/3qWo4Hk+A0GE4joNpiAhE27Epi5wkqftOT2zefN9nNp/T9v4KVVWR96Otqek0qgAy5YkfRScURWc8Hg1biLrvDiQbVuqEgAGb+Krra1IUVMJwjnAGPhLHOWla4Dhe3xLVKEqOH7joBoShy/6wJ0oS1Lam6nJqrcV1QjRVZ7PboiktmqpBawyttudOANHaN3WH2rW4riXCR/Mc2zRI8oLpZITjubRtA+2IJC7QsFA7nXASomk6WZ6i6CqBP8IPA6qmIU1iwrGwWi/yjNl0gucK0PPi4oIoihgHPvv9ns1GzIaOPRqcc+I4FienqtE1LZZhoinqMFfXlWBc0nbYjjWMHfvtjrqsBv591cePSYcox3EIPDGSTXtRlms7Q/dgmRa14wo/RE0nS1Jm8xl+4A8gpqKAZQmVoWmpGCbUTUrbdnTolHlJXDdMRnNs0+G4OwmHaVNjEjo8eTLFsAzKQqNMRStsOT6jcM6rN28YzyaEsxFFnVApGT/+xY9QFJWZM8MJHAzDYrPb07XgOh51DU8eP8VxbU77HaZpMQqEaM4yHbqedNQUNV1Vk6cHmjrH8x0My0RVVOa+z367F+nU8QFXb4nLI51aEIQer65vaBUVO7A4pRltlWNbKkUV4+kTHM3BQIxTbdNSZBWtAZ5nE2WiYzH0hsVs1Lf1QFPQUGObCobaYto6eV7TtRpxkmD3gijDMPB7sNW2bX74wx+yWCzompquFS90XdfoBni+PTwjvu9zOBy4u1uRRNkgfJKdx2j09nn7qutrURR0Xeejjz4iiqIBwdc00UY+VB2WdTHM5Y7rkGQpKIq4IUWBoVd4Y591UQm3ZB3
aU4Jl12wPJ2xbyGB1w6ZqBBpbodOoBvvDCd3QcF2bNC8pqprjcY9lmwSBh6V5zGYTgrGPYSqs1g2dIroDy7Bo6pSuAUUVbZkchS4vLwceuwRRsywbbow88SXIKjkJh8PhHcXk4XAYWmopfpnNZgMZS86WmqYN7j/S418yNmWCsjw9zs/P2e/3g0eEJL9IDGS+WGBZNpvNpldIxkP7KR7yflNiaqAwnE43N7fkRUFVViwu5rRtJ3wQLIv75HY44R5dXAyrWae31nddlx//6x/34JjB40ePiE4xh90BQ7d4+uQpF48eC3v2w57kIAxoFE2nyjMaVaOra7oWijIjihKieM/peE9d1XRtS+AFTMOQtm/Zsyyj6EHVal9C29JWNWWa0QCpqRHVBYFnkWcKrjXGcexhhJOjr+RLSCDZ8zwCz+ZwEDGE8rSWALB0whLrcZO8KIdQpIcMWMMwmM/nQjXsWGiKcG6SOQ/SYEV2wZZloWrqgJ8Bw3MgcYevfVEQe/KMqipQVYXxeNQjqQWK0pHnIpi1agQ6K3QFAuRR1Iy21Wg6he12g6romJZDUeRESUacviUlCYfdS0aBsDvTVJW6LnFdh/lsQjgKSIsc3VRoO2EdNhqNGY/GeLbH2dkUVdOo6gLbNKjbhjLLaaqKqq5Rmg5UWC6X7Ha7wSZObgl83x9WQ5KlttvtUHphjSwQco0pRwUp4ZZcdylflq2iBCglD15uKCRwKvkSco7d7/eDgYwEIGVrKYlcSZKiqG8BP8nHl27DcrbVDQPDtFFVnbpqUFSVLC9AUTg7P2N/OmEHI370ox89mItFzqVYr8V4jk3S07yvrq6wfZcsy3nyZM56tWI8GkML7z17j6qoOW63AjsALF0hrUqaLCdWFHTDZBxO0AwDVYNOrcmLVGwrHJfF2RnTcILSdcRpSlXXqP2Lg6KgtAigMy9oy5JOUfBNG82xqIqUIskpTJvETGmat/Fsvu8PUXCyNa+qiqY1B0wnz/PB+VkWE/nZO64viqdjo6mKGEdN4WRdVSW+57Ja3WObBnRviXfyOZLbMvmcj4IAzbCH7y0xNCnJl/jUr7q+FkVBWJyJ6AjLMgdnYenmLH+oqq4YjQJs28JyLFRN4+XLV7StcCNuGpU4jqiaDlQd1RAtVhwJso+iClrJ9e1KzFl+QJJldG1DOPL56IPnuLbO/hixnE+Zzc4Yh2M0pWN/2HCKD/0NLcmKgiAY9XOsuIn0LyKIdu14PA7AmKQjy3lXsislMiw3C5I1KV9Y6U0JDC/mw5dLbiikLd14PB7mUEnnlrTrqhJJXKfTacAsHm4dRqPRQODK86xfo4nxRWw0OgzD7Fe+GYeD+PksZ8RiviCcTImOMcejIJg5rsfV/RpF08jynNPpRBAEVGXJdDbj7u4O0zCgbXjv2TPBEahryAs++eYn3Fxf41g2nu2ymE3Jkxhd1cmTGNc2CQwdrRZy7MB3CcZjdMchLwqiJEZRVWxHZ7mc0VUZRZbj2uJEL/NcWO27ApA8no4opULXgK1b7A4HfMumU8CgQ1dV0ixHbQ00RaPtpc4SPJRqVcmBkYne6/UKTVWF+Wsvc5Y6H0mg0zQNTdcIw/EATIusEw3PFR1nmsTYlonveZRFNrg9S5BYCv3iOGY8HjMOQ6q6Y7lcslqtBgBcEub+Rjwa/zqvuq44nY6MRgFtW3M6HfsHW+QtPH78SMhEFZF9kOc5hmkMI8bNzZq8f0l32z2nKEHVDdIs5Xg4Dh/eZruhLEr8wBfdAirOeEaWxOyTguv7DRoNbV3wxrH45OMPAZUgcGi7lu1qJZx3ew59moosSNOwMHXRXWTpWwWijO+SnInlcjlsWuTJIU9/YKBjv3z5krOzs8Hb4HA4DFuEbf+iylFCkrLknys5DpIGK4lYD0lSkgwjTwypupSUWNHmWmi5AChPp4jFYonsDpoe+S/LClAZjcfopkmR5txv1qR5QVHVvHj5JYquYzsud+sNmqYzmYR
kaSqcprpGgLuOjdWvgZMk4b35TPgv5AUqCtNwgmNZRIcjZdvhez6z0ZQ2SUijE7bj0JYWp32NltrotkXgOxiOQ5ZlJKcDm9UaXdPQNQ3TMGj601xVVfKyoG5FfmaWFixncyFuqhscT0ijHc9mPp2JXNCyRinE1kua7EqSm1DwirTy0+lEUzfolvjs5UEhX0jpBqbrOuvVPfPZHFMXRTrLMxTLJEsTLFPHNg0uzpaUVdnb0DEwGWUXKWMFT6eTIIL543d8NyTl/f8XNGcUBVVFyG11kQeQJgl5ITIKDEOEctiOx2azFgq6zRrTsnBch/Pzc95c3fTmIzmnOCZOhfZBrubkzOw4Dqv1hlMU80g3ifMtvucSBj7r7Z7AFYKZH/3kF8Rxyuf+S87PF3zjo6c8ee8D0VrHCXlRYhgQnY7YtjOkUMnq/5e1BVIoJV9QeUlZt2SqyVWalImLdaNgJq5Wq8HTz+jJN1VVMZ/Ph7Xmqi9ckj8vXX5kypPcU1dVRRiG73gHPrS567qOMJxQldXQFahKj+g3LWVR0jSCe+D6PtvtnizNSJKcJM3YbDfUTUvTNRyORzTdwPVcQefWdei6QYcyGi3ZrNfMplN83+fy8gq6ju9/73t8+PwDbq6uCFwP6oZHFxd0TcMvf/YTAsvCsww0VaXIMwzLZjSZYPk+6MLkRuuRerWDcTDCtizo52nDNNF0ndu7O7wg4Pb2liwvSJKMpqo4m0/plI74tENXFFRFpW07irKi6pmJsl2XFn5VVQ1WgFJsliTJYFzz7NmzYYyTStDRSPx/NU1N09TEcdYXb0+A0utyKOqaporYPd8niqJ3xgDJ+CyKgsPhSLxavePtKA8WEX33N2Pc+td2tW1LXuS9rFj4EmaDJbcBSoeiQN23xI7jUNYVh8OxNwtR+l2vOGm3uwPHU0RZCXTddWz0IEBRFbI049HFBRcXFxR1TVGV3N9es1/rYraNdMIgwPVGvLq8hbbl55+/4H615jvf+Y4gRHUaqmZxjBKKIifpR4bZfIamM8xugr22GTYLZVnieR7L5XLQRDw00RyPx8NeWc7+knQlx4KHL7CcZeXpfTgcBhm1LDaSHi1xCXmqPZxJZeGUnAXpExBFUlWqsLpfDyo+4e+YCTCvKLm7W6FpOnGcsj8cOZ5O7A8nkiRldn6BYVqMDCFcszSdxWJBVVUcD7vh5zN6z8iu6yibhvlsxvn5Oav7e9q6JktipuOQ1d0NdZ5jmZbwolBVyrZhMT3DGYfoqkpTNzSFwAPKvKDMhYu1bVkDMUpiPVkurNXrtuHy+pokL8jTBLoO0zIoywLXsamqEt0y0XSDrKzxbdGur9frQbgn3bylPZ4s6J7nvuP5Kb0WZfHebDYoCPampqroqoKhaxx2W7q2o2pqjH7kqfuuQHok2LY9yOjli6/rOn4QsNufhgQp6VwuwdGvPU9BPrCy1ZUfrjyxZJt7ipJe8edQ1wKI1A2LLEuHdny1WrHdHyj7D0fskTt812E2n5EmIqFpdXeLaVssJ2PGrsVuu0VTYOwHHA5HDN0gnCzZbXdc327YHn7ETz+/5FuffpPvfPYtzpZzmrpgZpmk8Ykg8FBVheVizul0YrVaDQIvyWyU3YIUNEnGZRRFnJ2dDcGx8rSfz+fDqOD7/iBi2u12Q0EBBtBKfgYSs5DdQ57nXF5eAjCfz9lsNkOradtCPyIxD00TaVVZmhPH7yLVx+Nbk5vJZIKqis+3qGtOUcL+cOT1mzeYpi28HbqOzXbL7/ytv8Uf/ot/QdO05LV4CcqypGvf+k/O5nNWd3ckScJiecann37KzfUNvuMQnU4sp1PS6EQaRziGgRv4ZEXGKJwwWS7RdPGslHVDlsbohkFRVMKrEEFFbiqxATBsa1CwRknMKY64vLri6uaaU5ISHQ+MfLffdFWomoKlG6Ao5FWNZZgDbVzO6A8LtizMk8kEQ1fo2mZwaJIyd9M0hyRv0xSmsaM+XSrLsuHEl4Q
5UfjrATuSgUjy3jwcYSSZSb4/D42ApbZGFotfdX0tioKuadR1yf397VDFRYpvTlHo6LpB1zXkuWivyvJBSlIP0imKwna7FYVCU6nKBrXTsE1d2F13DS9/+UvathE8ebcDQ+Hu6kuapqWuGtqyINF1LMumaxVevb6irltaxSDOOtK7A9v9n/Ly1RXf+vRjPvzgGaORS1OVjI0xZZEPoI5ksElHaSlflgj+eDx+h9UmV0a6rjOdinTlPM8H4FLy37u+7X7y5AlxHLNer4eY+sPhMKy0ZACuPImkKYfMdpCz73a7fUCTFifYF198wXQ6xzDdHjsRcetXV9dDsTEMYVG/PxxI6obdds+rL1+T5wV1A4raayCqmrwoORyO7PY7zmYzoihiOp1y2G/R9X6F2rbstluWyyUfPP8A07DQVY276xvKNGU5Cdnt9hgKmLaNZ9uYwYjWtIiLkkpR0VqhhByFDl3bEYQ20W5PmWQsFwvOnjxB8zwaOvwgoKxriroSL3tZ4I9GVJ1Klqdi7DkdyJIINHA9B3RdaCl0A9N8a8MnW3IJPErw8XA4MJuMKEshaZcjpgSCPc8bXmLXsaHr+m6p7e9zQF2LMVRI/00M0x4KtewYsywbZAISjAYxKkumqjQckqawUmb/K9/Hv9nX/d/skixAcYqmg1WZZNCZpjBe9VxrGCtEocgYhyPyYj+0bLLK1nWNpii0dU1Ti927piqoqLRNRV1pGKrNe48vqOqGm9v73hhFx3Yc4ijj2599l/3hyGazI8lK6rrklOT89Be/5O7+li9eXPD++094dL4gzRNGgY+tCWttmQol50oJKsnIcMnalBuDyXTKYb/noR299P6XLaIcEy4uLtB1Y4h6l6Yn0pFpvV4P44qqqkO3IFtm2SWMRiPMHv+gbyeljPvNm9d4wQTTEDPxvke7pchLZBQIU5VXt/dcX133OEOHUTWkWQn96dk0DevNCtMw2e8POI7I77Ass7eQU1hFGIRQAAAgAElEQVSt19Q9cDYaj1mt7snTFLXrWC4X3NzcYABtz8No5jNaVaFRQDEN3CBgHE5Fgc8KNFUTa8pTRBLFHPd7LNvG1nXKpuby6orb+zvuVvd88fIleSEAOkXr2a9FynaXU5Uphm1StC1+OEXRFKIkpq1SPM/F78lBbdsyGgX4vs+TJ096fUsy8DCk2a7k2YzHY+JYxAFMJhPyNCGOTr3GwaGpKzTVJc4iVAXSJGYxfw/bcbm5ux/el4d0ZymdFl2pQV6+1dUsl0ug43A40vZd+Vddv7YoKIry3wD/AbDquu7b/demwD8G3gdeAf9x13X7/t/9F8DfARrgP++67nd/3fdQNY3z5SNevnyJawuUlk4V2n10PCdgf9gTjKa4ToBq6JziHMcNsCwP1y/Rj0ey8kBRHyjrlg6Noi6o4wY97ROhGoF2j3wf27LxbAfbMKnyCFNVUTSF9epGkH50k+32GlUzmExmuLZD05iY5rRH3wvevNlwfb3n+fPnPLq4YDQu+eDpGFXTiPMKXzWYLZZER9EyOraF3+dP3q92NC1Udc2bqyteX75B01Qs22S5XPDm6iU3t7dYhkfTdFiWYPUdDgdWq01vDlNjmjZxnHKKYrR+nrUsi+XZObv9nqwoBTAmjm90y2Z3OKGgEucV4XRBXlzjmibr1YpwPIau4+b2Gu2wIUsyDNNGU3WaBg5Riq5bqHpMWZlUlUKaFSSZcHq6ubrG8wLSLCcvCrSqZDEZkx+P6KqK4zl0XYViNET5Hse2iPOIKE0Zj6bo1ogXn/+CONoxXyx5/OQJ9+s1TV2RnFLm4wnjZUip2ow9B0wIRxaerdPkCXVekZ5O5KlYW1ptiaGVKK5DYxrUpkGZ1RRtS5znpFlB4I8wDJPD/kCR7KnrnE7VuD+dhITeNPFUkzrPOA8cNqt76s6lMTTM8YjAs/F9D8exCTwDTalQKRj5Jqrmcdgf8DxvkINLx+bVavVWaalqNKigqSR5QVW
37E8xluWQRRGjyQzVsKj6EeFwOLwjfjMMY+gKZIRfs9sxm47QVJX720thLVjV0LWY+r+dHdt/C/zXwH//4Gt/H/j9ruv+oaIof7//9d9TFOVT4G8D3wIeAf9UUZSPu677ap0mQvS72+346JNPiI5Hrq+uBlRVsvZm0xmnKANFwdP9vuK2Q/vWNMKsUtM12q4ZWrCu7ejUFk1V+1gxG9MQNuVRFAFNT7MORYvetOR5QVGW1HXbi2BqTNPB90MBAB6Es+6TxWPKsuDy6pJTdOT8bElT+nz4/AN8L6CshN2ZdE42DJ8oOglfBcOk7Wp2u+1ge/bJJx8TxzE313ccT8J5qsgLXEeg0Gm/ypMKUsMQDtbb7ZambRmNR0PIqdozFne73XBKPXr0CEXVOOt32nmekds2bQdJFNN0ULcdmmmh6AbXN3dstztcxycMp9R1yylKWcw8Nrs9qAZar1QU6km/lxafKGuxx+8Ujbbtpd2KOrAvo36NmiQpt3e3BF7Ihx9+SBTFHDZXKNQE45Btz7g8Ho8sJnP2hwOe42DpJq1WY/sWaqNQJiW6anB9ec2rL1+SRCcuzhd88OH7zOcz7EZlsZijGyZVkbOYTzkedtxeXRH4Ll3b8OjinCQ7MRqNyIqSJM9pu7ZXctYkdcF8IlS8ujvCcQUd2XN9Li7OB6NdgRmJpG7XtIckMLnZkfiNNNy5vLxkv98PwUOSO6LrOqZl4fabLFlEZFYoMOSYSO+QIAgGklzXdbg9uUlyYGRX+W/l0dh13R8qivL+X/ryfwj8e/0//3fA/wn8vf7r/1PXdQXwpaIoXyBCZ/+v/6/v0fZt5pOLC/a2jdqr9+ReNcsyZrMZmi6whDSJUegwdI3NZs04DAkCn64RiUF1DQp9zoOhiXWmZWObJqZhYPU2Y6ap4/kOlmWLG9+05GWJqmo4rouq6my2W06nI57XYlo6H370vCelrLm8fDXIkfM85c/+7E+5P5tyeXnLx9/4CN9z6dqK5UJYf23amrIsaJpKMOUCj7OzBVUlqMNZlnM8njg7OycMJ9i2Q9e0FEXJfr9nsViw3+8HJx7P85hOp6IL6lq83nb+4aZDqB2zQXhVFAVd23DYrglGI/a7La4XsNsf6VSTY1pSFg2r7Ymr63viJEXXUk5xQVOLTI1woqBpBodT0m8wXGG2W9R4ns9+f4AOdN0c2tu2L86qrvWboxNZn9kQxxnvv/cRiqry4sUvaMqY87M5l9fX3K3Xg8PVfrfj+eP3CHwfRYE8SdGUjqg9kmoZpmGz3mw4nE7c3lyhOwb+bofjlVh2gOva0EFbFXR1RTjyUZUWTVEZ+Q7TMEC3HnO7WnFzt2K5WOC4LverW1RVZRKOSbMCTRfORh1v1bdFmfPs2ZN3nJWFSrNFpnFJxa3EfMIwHDAgiQlI9yuplJWZJ3kuQnvPzs6GdbeMLJRAtlzxHg4HJpMJ0+mUbc/+lKIo6WT2N+GncNZ13S1AJ9Kll/3XHwN//OD3XfVf+39diqL8XeDvAixmE2az2eAok2UZm81mIObIkzEYhWI91NSMRgHH04nz5RLX9/A8l8NhzxcvfkmRn0Qyr6pi9fO8ZZjYloWhiR85TROOh4LdXmQbarrQ4LeA7biEk2mfBRGwWIqEaNMSyVRRFLFcLnj6VMh5X71+JcggSsfV9T273YHjIeLZsyd88P5TqqqhbnLG4TnrzQrXsRkFHkWR94zGGl03OJ0iDMNiuz2gqRpxlOHYNtAN9GKpqz8/PxcjST8bKqqCbhjs9/uBmGIYBo8ePRq8/jabDXRCrHXYbXn06BHXt3egGmwOR4qyIclysqLkX//iBXmaYOgGbQeb3REVndnMI0kzxuMpmm6gamL12tRC2v3Ztz/jj//ln1CUNb4XYDoOnidWn20jKNNxbFPVDXXT0bU1Yejx/vsf8Bd//iNub1fMQ5/t4SAoyLrwDzgcDpzPF7z88gXH7QbfdlnMJiwWU7wnHrZtEcUJl5d
XJHnKz774gs7USJsC3/VYhHMePzoTLNEy4+rNl8K9SulQ2pq2KomSE944wDINDF1jPB6RFyVJnKLgkN/eMZuOMXWdtmwGc5uqqjkeTlyp1ywWc9q2GeTbp1NMWVbvmPnKl1qSzuTGJ4qiAYyUmzhZJCaTCXVd8/r160EPBAwFQWJY0kej67pBvi95MZYlxk8ZT/9V11830Pirlp+/UnnRdd0/Av4RwG9844NuPJ/z4uc/J8/zAX13XXECSbDOsm00TSfZbAh8j4iOMBzTdhJE7IbU4a5T0XrJr64qKHS0dU1elZi62O8augaKgqYJq++uExyD6BSTFxWGbhJOQsIw5Ob2ktu7a8IwpGlKfv7zn9I0Dc+fP+fp08dUVcXjRxesbras1ndU9S1xHBNFR95//wnTcMSby8sejGvQe/n2k8dPyPOCqqpRFBVDN1mvdv3ab9qbqiTDTlvmGsgNh/RWnIQTjH4scByH3W6Hpmnc3NwMm42iKDgdD9RlQZzEVHVNVTf85Gc/4y9+/FP2pxg0g2A8IS0a6koQfEZjAeAZhkUQTtEMQQP+3ve+T11XRNdbHj9+QpKkaKZwZdL0kjCcYNo2juNi6CZpldPS0bQdddOgqgJAPjt7xP1qzdX1NYEfcIwiTsmJOE2Yz+e4jsNmuyU+HPEtizpLmX3yTSbjEUWSk0YJSVzwi89/yZ/9+b/is9/8LlGe8eLNK1bHDaEfMDYdPvnkIyHvrgq2m3t++tOfsVwu8X2PV69WpFmK6ZgEnks4HuEFI158+YqyrkhT8FyLqpYvtvBykPL+NM369e4ByzLJspzJZEIYTlivN8PW6e7ubqDwS/6JZJcKqn7zjveH1FNIjoNcU0omrLTSkw5ZD/0T5CpfHrRy4yRp1l91/VWLwr2iKBd9l3ABrPqvXwFPH/y+J8DNr/vD6rrmrvf6l+tFmdQsr4uLiyHkomlqNFXl0cU5ddtgOw6b7RpdV4Uufn2ibUVRUBCJRCXg2Da+K3bEbSNeiKrpMAwdpxeqBJbDtz/7DkmakSRCMPNnf/av2O1X7Ha7Yd8sKcFSzSiELg6OGfCNb3zM7d0N6+2WLBOzu+85LOYT5vMJo1FI2yjUVctmsxvszaqyIIpWg5nq6XQkDAWbMQiCYd0k15YSWJJil6z3SJAIt9x0bLdb5vN5jy/E7LZb1tsdv/f7P+DVm2t2x4gozSjqFtcb0akWumHTljW6bmEYNkHg9Kvdt/kK19fXTKcTDEME5nhewG63ZzwO0ZK0T1bqsExRzE3LJE4SOkDTLbq6xbIcnr33AT/8k39J3TQcT0fKMhdbINdlu9+z3W4IPI9DUfC9b3+b737nMxbTGafjkas3V/zi8y+wnIDXV1dsdwe+fPWaZ++/T5InvHj1im998jGuovDDP/ljPvzwQ968ecObN6+o6wLPszFNDccxKaucrqnoGoNJOCbJSo6HvejmWp0oTmjqirrxUVWDsqwG2ngYhjiOMHjtDFgslxiGRdqHwT404pW6FGBgqUregXy59/s98/lc4Eo9NXk+nw8chbIUI6VwpDYG79EgCAbCmiwu0ntTjg3Sou+rrr9qUfjfgP8U+If93//XB1//HxRF+a8QQOM3gD/5dX+Y0bPNHuY/yHh3RREmHIqioPfjgGUalFWFjjYQdwxNw/c8ZrMpX355I1BWBVR0EdFlGEzGI/HyWjZVWRIlGZ4lfAkXZ0vSJOfq9pY//MN/RhSnzOdzkjRju9sQTnyePX2Ppm0xdLGmO55O+J7HbrunqVuiU0yRX/Pes2eoqiIMY/KcFy9e8eTxI/b7I9vtkfff77g4P8N1Riho6Brstvs+c8ICWjRdzIDS4VcCRzIbU1KmpV4+joUASKZXR1HEfD7ncDjgui6vX7/mxYsXFGXFar3l5vaWNC/YHk50qFQt5GVFp+jopjhNRv5YcPdVHbcPMinLEs/1+gd3w8h3ORyOJEnK8+cXXN7cUhYVtusSxwnhdMrxeBI
xc0rB8XTEsGx0TScuEt7/4COOp5gky1F0TXRFmk5alOS1yPkwNY2ma3n69DFn52fUdcWb1684bHa8fn2Jabn44wnHU8Q4nPDyy9fML+ZcXd9Q1jmff/4F7y+WuL7Pk2fP+PyXX6BqOrP5ghYoyopxOKHtOtI4EjmXecX19S1RHCNDfuuqomkE+9Tzgt4YR+np8y6+H/TW9R1NLUYL2RVIUZnRj3gPg4ju7+8Jw5AkSVitVgNNXmINQRAM4jjXdZnNZtzf31OW5RBVLx3DpM5BMmAFXT3szYJP71j5fdX1b7KS/B8RoOJcUZQr4L/si8E/URTl7wBvgP8IoOu6nyiK8k+AnwI18J/9us0DiLRdycmXQJqqqgMBZzIRrkJ1WVLkOYf9nqIscVyHtHeZsW0bx7aZhCG6ppLWIjPR7VeQI19QSLuuI01iurbD9zxGkzmj0YjTUVhpe67HfH5GBziOS1GULJdLrq5fU1cln3766eBhcDgceP36NefnjwdbeX+ksz9u0VTpXKSQFRVX1/cs5wuurjbsdim/8UkzsCAvHp1RFEIj8eTpI+L4RJ5nzOZjukaY1kq+OzC0f13XsdlsBLHFtvCDgOPxiKIo3N7eDj4Mkh0axzE//cUX3O9O1I3wJOwUXYTotC11B7qmYuoqqqIxsQOiKMLUNFQUqkJkQKq0hCMPlQbaqreBMwjHwuSmroWrcprlXDz2ePHihVBpNjWO65JnJY7tEYwMfvvf/R3+l//5H1OWNYoigmsbBaq2g7bBdiwWixmz0YjzsyW77QbygjQ60dQdfhiiKDooCpbj8lvf/z5ZVXJ184bn77/Pan3PxWLOdrNhtNvzk5/9HMt1Wa/XYmsFwu24qojSFF1pydKENC3YbNakadKHGBvkeUbTtlhNSxuJ8c0wTIq87P0lRPtvmkavZ+jjAa1uUL+eTiLF+uzsbMAYpE2aPBAly1NyQqbTaZ/WFQ9ahslkwmq1EsY5Pdi83+8HAFIWFenqJf+SwjlJivsrFYWu6/6Tr/hX//5X/P5/APyDX/fnPryqsuL+/h4ZlCrpmFI9KD+IrqlIkoqmrggCgTkcjwehHDM0bFN0A+ORcEui5yYIW6uWLBNfsy2b6XzCeLLAsAWBKIpjVFXHcl3CcMJoHKL0rke3d7fMFzNev37F8RjhODaqqvH+e8/54PmHNG3Lzc0Njx6d8+Xrn3F5eUWaxuiaiWs7TCYzVFRWqz3T6Yz16sDlqx/w+PEF47HP6RRxfrHo27uaKDrhuCZ1naGrzqC/l97+crTa7/dD8EwQCPsv6cvYdZ3gfbgup5NYg/7yl79kd4ioFIOyFfZgqqqjIJhwjmVimTq62mEZBo8uzok8j6qucG1zoMs2dUlV5hz2G+q6QFFEW3x9fYPaZ1mWZYWuGf3Du6brFOpa+E44juBY/MY3v83xGJHlFQodeZbRNAXBeEKnKOi6YE2ORmO6ruXNmzf4pkG82+FZFr/927/D+x9+TJLm3N2tiNMC23HYn46MRiMuHp1haBqzaUhb1syWZ9zcr3Fdh/PHT/mDP/gDPv7GNwhnKlXbYVo2h/srJprO/eqe/X6HYzvYnkdZV70Fm/j5ug7atusl6TVpkvWFusUwTBRFI4pioljcD+k3KqXopmkOeNl8Lqjx0tRXgogPx2ip+ZnNhIK0LEts2x7WtSBUl3ILIY2KpE+HdM2WgOfXXvugqspwmsmZWSKq0vw0yzJMTSFLUwF++T5F7wFgmCIZWUSFmYThiM16B11Lnqeoilhfjkcjurbl4vxcUHSPCVkpkOJxGGLbLkVVsd8fuLm9Z7vdMh6HXF5doesau92O+VzBMh081+PNm2vm8zmr1UroBlQNy9L59NNPsCyHtobdZsd+fyKLMzwn4P5+x3vvvUdlJ7x88QbL1lhv1nz7s2+iKA228xiUGkXRyYsYjQ4Rd2YN2nkpGQd4+fIlbdtye3/HZ599NpisSJDR8zxub2+5urrieDyS5hW
1aaGbdt/CtpRFgaGp2P1Ma1smKipVnnO2mHN3d4uuKHSqWKM2TUN03EPbsNus0ayAtu344osXBOOx0PZnGZom4ti22xdDK6ugYJgmhq7x/e//O/ze7/6eUJGqHY7roKk2SVFgOQ6PHz9CU2G92TBybSw6sqxmMpvxrU8/xfVc3vvgA+q644uXr7m+vuP2bkWrtHzzm5/wT3//d1ksZ7z68jXjUcAf/8kPefL4CeNwwouXX5KkqSCXxSlJmpMXJVmW0u12xHEk1uBxCppO3bbohompa3QduJ7ba00KTFPYwe+2OzzfHajlXdeh6Qwg8Zs3b5hOp4PEHRgYrZom0qHCMHwnmAcQLM/RiMNBkKDG4/FwbyWzUeovJPW5rmtOp9NwsMruQvIcvvZFQdJmy7JkNB7j2DZ67wzcNg26YVA3DZbl4Y8F9uAGE4r9HsNwOD87p21bkTkYLlkuHvPjH79ENyzU1iQrDSaaT1kZhIHH2A0po4wii1Fcm8C3KeqEoqj4jU+/TZYVrNYb0jRht1uzmI55fDbn1StomgpHbWizE0ZbsBy7uNqcYhrw45/8mJvNLdDh+4HgrwcjPv74E8qy5Kc/+RmmbXC3fc352QWqq3PKYvZfHjmmMV98ueC3fvMzdF3j4tzlWfgY01R58+YVk+mUvMzwRwG77Q4/GLHfHxmHM/78z/9CxLu3grxVVhW3Nxu+fH1NC9ze39F2HUXdiWJgWui6hu8Iw9q4Asd2RG6hbuHbgkVZtjqdZlF2Co0qJN/HVOgmsjQnLxtAJdtvhRy5q6CtMQ2dotI5RhFuEJBXNUlZ0tYGXdNgaha2a1PkMTe3r9EMhboqqYqWMAyx24KPnn/IdrOiLnLm4Rhb1cjiI+EkRNU0rm6uyfKC7ybf4ac//Tlvvvw56+2W2fKMKM343/+PzymriuS6oCwKWuAYxwRxxPXNNVVV4Fomptqx2Wxpqoq2zDnEGUpSEU5C8rLENAS+Qa2SqdC2DUXZUfkdjaqAoaGaOsfkiGYCWoPj6uRFQ+AHLJbLIRpPcgikr4XMG5G42WQywbIsttstdV2zXC7J85z1es1utxvEa3EcD5sKOWpLqzfJbEyTlK5pyWIhn9c6BXM8Rmk6qqzAdv76gca/1ktWuKqquLi4wPd9Xr9+Pcxbjx496mdiIf7QVRGT5nk+pikQXuE8I0xPPvroI/7vP/0RZd3g+SPOzx9hGQZqJ5yDyrLCUDTOl2fUhoKua9R1y7NnT9is7lnMz1jO53zyjW9yOkWcjnvUrqbt2kG1Zts209kUP/Cpm4bbu1vOzs6olUYw+vrVY56Jmwr0UvAR0LLZbTAsA9dfEB2PrNcbDN3gD37wh7z37ClZWlBXHapWk2YxVdOSprloWZuOV6+vuLq6Yb1esT+c0FSd29WWuqnJsqxX/d0LN2LTIPA9DEuIlbpOyHerskRB4fzsXDxo4QRdVfFcj6Ks8UZjsrzg4uIRKA0tArwNRiNs22W93pGlKXmWYZgmXdv0XgVd/7NagiVa12iaToOKY1t0LT178UTT1CgKGKYxWM395m/+Jq9fvRJZnYpwy1IUaOqGw/4AtSCBLRbn/NEf/RE/+vGPOUYReVVwdX3J3XqLZpropompKpw9ekR83KEbBrbj0NFx2O95+vQJ+92OPM+4ubrGssXKcTobcXZ+ThTHZEWJnueDH6JsxauqQtN1LMdGUSFJUgxDRVE75rMQVVU4HPcoqorn+UNEgHTAkgQmXdexbWHWMpvNhjWlBAYl8eghoUkWAenJILNS5BpTang0U+t9NzNO0YnywTZisKD7FdfXoigAw7ggyUpy1/7BBx8M4ih419vQ9/130Ni2bQWAg0kQ+BxPMYoiaMqqahEGAY6hMxqNeXpxRlJlnPKEKDpxtrwgi1Nub+447SPmiyXb1Zrv/tb3MDSV9WbF6O5WoP6HA9EpEuq1QyRuomlTF0L9OJ/N/x/q3iTWtuy+z/t235/+3P6+tl5VkSw
2kmCRFBwhsQcBEjhA5rENBIgzySyzjAJ4mkTDAA4CaKipJMOIHAmwTVFiI1Ji9c2r197u9Gef3fcZrL03iwnpRFQQVM7o1auHuq/u3Wedtf7r9/u+HpZ6c3uD67ocHR0JSe7dQkBNCyEnOT4+Zmfb3N3dEkYhsiTx6WdP+fTTz3jw4D7z4wmu57DaHoQVSxamo9e3S+IoJs8KJNVgH4RswogkTvoHWFINVODk5FRYhWSdsmrY+iIgMxxP8Pd7qqri8uKCi4sLPv3oY3GF5diMRiOCYE9dlwxHA3a7FUF4QFU1NFWns1YfggOmbZNmOXZ75RVFEd5wxGw6wtTVvri1226ZzWb87u/+Lr/3e/9Tey728TwRzf32t7/N6xcvBQLd0GnKggYIwhhZUQmTmMl4jKLoRGnK+x//kLvFgrKuKRvhsawlmYZGJFGbiuz6ClvXieOQ27s76rLA1A1qJK6urlFkCeSOTuVyfn6O43k0ksxoZvDq6ooa2uixhqLpYuDY1pWzvGmtW3UP6dV1rf11ymw27yVCkiT1PInb29t+YehuF7orxo7X2aVWy7Ls5T0di+OL2rhO7bdvf57DwQAJcYWt6TqSLJPlGYoqyodffnBrG2PtaqBfHIZ0qjCxSmZ9TLT7BnYk3S9alCzb4Oz8hDD5XESKG0EySqOQ0/kMrf1hDl2Psiw4ffgGZVlzd33H2B3guANcyyYME/7sT/6E8XSC5bn4QcjjR4+Js5wH8+M+FLLb7Xj05E3u7u5wUqtF1W+Fh9Hx8Pc+241ocnZgE0nRCKOY+Plzzs5Ouby4ZLFYtNHZEkPXePfd93CGLg8fPaRqB1qmYRJFCZcXl2y3B6JYPER5A2EsPtFub26F38D1MHQdVdHQLJ1CLciKEtcV5/vuAcyyrMeCXVxe8PSzp5yeX7T33gOSduHsuBdBECDL4kFWNY2yw8lLklCfqWIRUBUVRdYACalpGI3HVO21WufIjKKwZVg2fOtb3+Lm5obNbi8eZlXE0yVFJUoSDFVFMyySvEAvK55f3/Di5hpF1dhHEZphkJQlcZriDYdYrkNelcRJTCzJaKqGJCvohoplW/iHA1VZkAQheZ5xCAK+/o1vcHn/PlEUkW42SFVFFCXkhUDPZUVBBbie2894ylxCGw4wTJO6Kls/qSl2jmXT7y67uHnnL+16Kh2sV5Tdln14T3RDIobDYX/ccF23h6l0tesOjtPNIsTPKET5goqxt521arkvZoD+z68vxaLQDUk6PVsX8Oi+WV3/XlHC/n7+i469xWLRwyc8zyOIU+7dO+fFy5cUZUFR5DiWgyYLY5TtOIDEfrvD0FTkqmZxfcN8PME0HZBVmrLisPeRG9htdiimxdtfeYfVasV0JviJ8/mcIAj6aPEbb7zJ3/z1DwkCwTRMkpSbmxuqqm4TgSq+f2C93mA4IsyiKgovX73i7bfewjAN9rsd08mEIs9wvQFZWbE7iEzEwRcuAQmZqlHwffF1DlGIH6UYtsPQFLajPMuoGhnL8ihLGA4GFFlG3cRMp0KoaxsCH+7vxMP4zjvvYKgacSSuqwzDwPd3LflKgFWiWGxRm1rEa23bZrtZtVwIgyAMsB2PyXiM7Xo4touhGZR5IY4aaco3v/lNXr582Q9Moyji8eOHbLdbAXxpP5HrRmjWgjBiMJyw26wZuA61pBEkOVkQsY1iwb9QNVTLpkxTLNdpa9UNYRJDXZOVNcfzOYv1Gts0SdKE2USIf4umRlIU3nzrLTTD4Pr6hiCKyIqCyPdZLJcEYSyucSXQdIOmqdubHwVDE7avskg4Pz/r8XkinjztKVyyLPd0qe6mrTOh27b9BYuXyKPcu3ePq6ur/s91OEFJkvB9/xeAvt1NRfde6kQ/YRwJ6nhZ9CWopmn6K/Rf9vpSLApdJLO7S5/P5z1EJUkS0eff7SFXz7YAACAASURBVBgMRgJfPRj06OruzCXuh0X6y7R07t+/xPuZw2bjs9tv0VSNCkl
0KAwTZToR9dG6Ynm7gKqmzApqrUKW1T52apomqmlycu+S29tbLMvi7OyM4+NjfN/nwYMHPH36VESOk4STk1N8X5wF67phNpv3gyMAw9CwLBvZsBhPxvi7HXKo8uLFS47mM6I4RjcMFAQYoypyXl7dtNFVWaDt0xzLjVA1g7yqkVWDsg64PDpGkiSyNGe5WGAaJq43IPAPuHZFXSPeQIp4MLzBgLIoiOK4f5A1XefJkyds9+KaS5CAQkzbpiizHhhTFDW6Jq5KVU0j3e+xXY+8qJAVBdcbMJ3OGQ6G1GWF0jYkPc/j9PSUjz/+SKDLZImLiwsePnzEj370w54epJsmaZLgByHjwYC8rJA1g1pSCJIMyzIparA8EdIZeWLXN5pOSdIU3TTY7nbCb9mAIskEcUxTVkLwaltIgOe6VA0cHx1zef8hSRITBKE4f0cx2/2esqpb4rJYqIDeNdoVoGRZAkv8OjW0lrdY4XkiptwFkLqt/xdrzp33Eeidpt2z3PV/OuFw98buFABdbLlbhLqjSFeW0nWd9XrNyckJhmmyb7MMo9HoV74fvxSLQtXWbAfjMVq75elwUZqm9fe2vr/n+PiEy8vLvuzx05/+FF3X+/RemqZ41gDHNTk9O2axXJEFQvAx9kZIgwGyIlNVNTIgNRWaqiBLitghtI1Nz7SwvSFHJ8ckaYbnOnz1H/4DhuMhZV6SJgnQ0NQ1vr8XRanDgSKPyTJxVWWaVjttFvTkIDi010getaqzXm/57ne/wycff0Tg+2y2O1GprmqKqkKKI9B0aCTyLMc0baqqxjItaGA2FWdTWWo4PT4lDiNm0ykPLi+JDwd0VXgOBwOPqhKK9jhLkHXxcDXtEW0w8Nhtt+LIE4RcXFzwxuM3KBoFRZE4HHYUZUoYHfr77yLPaWpxDj+EARUNaZ6haSZV565sGpqqpMiy1vOQCSXadMqHH4ruiGs7vP3223z/+9+jLMteAlMUJYos6MtJlmPqBpbjkaUZiq1jeyO2/o7BSOwsJVlwYppcLHb+wW///0TBzTFddrstcRjgOg5ZWZJXNYqqYVg25/fv0UgSg8EQVdNZbbcEYURdQ1nXxGlGXhTohik8F1L5c6ej2lKidZ2qpSRNp5N2oRW7ru7KfTAY4Pt+H2fuuInL5RLHcZhOp30u5fb29heKVNDW0vO8Xzy6I10H8ukQ82maErRN46ppaCSJ9WbTg2W7ReiXvb4Ui4KiCrPNfrNp7b1ObzPqVsn1WiDC5TbK252VHj9+zO3tbV8ssW0bSZPIypyzsyN+9MMUTROJNMsWjcLgEKBJCqYmo8pg2iam5VAh0ygqjawyOznhq9/6TbS22+4MXYo0I9qL867v+6SZUM7vtytm4yG6ImHZJ5yfX5DnOdfX1ziOw3a77Q3a2+2WMAyJigOSLPODH/xAtOaKnLoWK/0hDHvicCVnyIqCLMlEQYBumBRZAeMx+90W0zBRZAW97ebbqoZq1FycnIhh7XoLNHijMcPxkNWzNZ4u0FxZO4+wLbsviSmSmIcgydSyYADqhsHr188xDLU1Ghskyc/v2htZQtVUirKkIaduxGBQlTXKvCSJY46mMw6x31KJTKIoxLIsjo6OaJqm3/ElScL5+YzhcMh2u2W/2yM1InWZZjmOK8hMcVZSSyppXnByckKSxJRFjucOqOtSJAmrmtFoCJLM1dUteZaJnZIkYTkOaZbz/OVL3nj8CG8wwnIcZEmmCCOSJKMoKxRNoNdMU8w2GiBJU0xNaktFDZoiiyFjIm4gTF3rewemaaPrgglp23a/C06SpDeGd8j9LsvRDRU7bN4XhS9i9+H9wrHVNM1eAzCZTES6UtNwXJckTftFZ7ffo+vi2lr//4MMpit5dDj0bijX6dGur6+RJKVPdyVJ0jcA5/M5aZYxOzri7uYG2ZDQdZV33vkaH330lBfPX5OmMWki7D5iK47YpstC4lI3DZqpMz89497jN7DHE3TDREKCpuawXvP
ixQuyLOXTTz7tS0e2bTMdDhmPx5weH7Fcb3p562Qy6eux9+/fZ7FYcO/ePQ6HA89v78TZUpLZbLc0LS5OBHnaW5aqwtQFmSepaxQJDEVGNQ2oKlzLxLYc/MMBy3Kp84IyiSmyDEvV0CQJVZHI8gJFkymrnDSNUWK1V4p112PT6ZTVasXQG7RkH5+iUTAMrT3+VKRZRF1XPxfQ1FIfpsqaBkmW2kafiirryJKErEAURiKKqzY8fvyYq6sbPG+A6zp8+9vf5vd///fxvAFBIL5viqwQRzFpIkxUjQRFVSMrKoqmCzJRXnA4BFxenuF5Q/I0Z3Y8xTR0Xrx8hlzD0HWZT6a8eP1azGc0Eb82TJOgPdsXaYbrebieh6JquLbNs2fP0Ayd6XSKZposN1uSFmlv2jaW7aCS95Vk8SyUZGlJoipYhkgrNk2D6w76K8zNZtPr+jr4bpZlnJ6eYlliVxkEQU/8/iLjs7NLdUTusiz7W4yuTv/F9myWZSBLHB0fYdlihuTVIvosKwphFP7K9+OXYlHorl8G7ZS1c/IdDgeG4zE3V1ftGc7sZw+KovRFj/l8znq9Jm0HWa5pY5o6g4HH2dkpT58+R0IhCA5oksTQsglNC0VqODqecnxygqobFDXopgESWIZOI8vsAp9P3v+AsWPy6sVz4VqsBMnGtgwsU6csM9arBffu32c6nXPv3gOGw2EbOS745JNPuL29YzqZsVgsMQ2b4XCMf9jjuR6z2Yynn31CkWdtfFXv669NHbW2ZhlFljBNYZiWJFVUwdOE4cAjjUKGjs3pdMaHH30ottK1zPn5GcvVkjxPOYQ148mYnS+utahE7kKR5P5+3NQN0m4QlpXsdqJboesyRSrMyJIkiwx9VopbEanEsEzKvETXhLhW0cVwLTiE1KXQ6+maxte+9lX+8A//CE3TeOedd3j3Z+/iOA6Hg9/DbHf7HXleMp1M0DRNkKp0g7qsQFaQpPYGwcpxbRd/s2c8GjGfzVgsbimyHLklWVuGgaaqlAiYrKZpAhMnwd73mY7HZEXJIQx4eG/C3c0tWV5gmjab3ZYXV1dELcHasi1kVSXLM7yhg2HoZFlKmqUkcYRl6TityLcLE4lSU8HFxUXvcex2jWdnZ/1RIcsyNpsNcRz36cPBYPALMwjHcRgMBuJ7tBMFuo9b3MB0Ou0bxoqiiCtT2+rj0EBvIuuuQ3/V60uxKJRFQZZEnJwcYXWrXF2TFxl15ZJEIaOBi6pbKIrao7Ety8L3fZbLpZiCb7eiQrwXlBpbsXn84CEffvARkqwycIZQS6RNQ1KDV8qYjY5neMRlgW4ZeJMxpmvTyA0f/OwnPH36ObHvc1UKbJvtOBwNJ5iOg6xp2O6AyXSK63mCzaDCwT8wP5rjuBbPnz9nMHCx7AeiQ29oLJcLRo7F+fGMp5895Wgy4pvf+DrvvfeukLuqGiVQlyVhkGIbJkcnc7IsxzZsXMvl/OKC4HAgywosXcc2R6zXS8zQRbUtFFmhaWQ0xcC1hhwOAbPRBMcqMVyHumqI45S0bLi4/0jUcOuGpq6IqpzJ0GVoaqzXSxRFFv7KMqdpagzDRFUU0jpDlhUUNBRJpZYEkamuC/KyxLQt8jLF8Rycoctmv8O2PXbbA6qmMZnM+fGP/4qmES3DwUAcAcs8RwZGA2FQ1lUFVa5ZHzYocsO9e/e4vr7GG1ikech6vUFSL6hXGxbLLUlWkyQ5F/cmrLcb4iSlbGSKsqQqxNBQVmTquqKoSj7+7FP80Gc0HrfN0zYIlOVIDQwcD9cZUJQVcZZhGzbjwQhdV9lXNU2ei0Fq2SAjUZcVjmUzaGXBXXS/A+h2QaSOxj0cDlEVmcHARVNliiIjjoToRZElLi/OuLq+xtBVwiDAsj10zWK/O6DrFifHZ0RxLNK/io6EgiKDVMvURUMWZ0iNxHw8RzcMguDAeDD5le/HL8Wi0NQ1/n7Hq+dNTxIaj8e4rssnH7yPruvcv3fJYrWjbpr
ehdiFZzoEeSdZHTtjFBSW+w1fefMtnr18zr/93veJ44TZ+AhZ02hkFdcbMZvMub1ZIJk6E9fBdWziJOFP/+RP2k6Aymw0oopS8rJAlmRR265qptMRF/cf0rTIc1VTWK+uefX6FZ9+9ok4x7V30iLfLjGbTdB1lTTN2fs+X337LbbbLYPhkG9+/Ru89+EHpFmGa1tiV6TqDAcjqCVmk5lA32c54SFAVVQ0R0ORJZIixxkOkDQNdzikLCpM3UZBRlMyBrZMFhVUSsVkNkGRNQ6HiIMfUjUS8+MTYeZKI2qpoawrzo8v2G7XIs1YFyT7EElqWiBNg6ZpRFGMrBtIyEiS0h//bNsRRwlFFsPZsmQ2O+LF81fkecHX3vkWz549JwxFVXrc+jZVpaRAtDU1VaHIMyRqLNMQnQy54fb2iuHQYzh0URS55WmWJEmGrGiUFZiWy9XNHWHo4w2GyIomZiVNzWa9xtQ1LMskjCJkRSErCj79/Cla2ZDlOXGSELRhIcNy2O99kfJ0HcqipChK5tMZ89mMjz76gCaq0QyDLEnRFZk4ijF0nfnxKbphsN1umc/nnJ2d9Z2ELqvhui6qImTHWm8DW6BrOuPJRBidTBPHsWkahbKk5V7K6JpBXcN0MmO1WhEEEVUlLOLT6VQMl8NYuDKRsS0Hzxn05bZf9vpSLApCeKmRJBmWJVqLUZRQ1yAhk2cFy8WKRlJ/ATfVGXKyLOsVWkmSUIQ5X/3qOxRyzeu7Ky7Ozrg4PyMKk1aBtkduVKLRiOvVEmvgMjs54v7jRyiayp/+2Z8RhyG2afHWkzdJogRfCzhus+qjyQTdtpkezSnygu1uj3/Y8erVS9arO3zfZzqd9np4gJcvX7JsNV4C/V6Qt5afpml4+eolpmVxfn7Oar1i13L2TFuhKEsmLfLdMA1msxlxHDMYDpAlsb3My4Ltfoum6miqjmXa1FVNg8RsNmW32eE4HofYZ+R6JEnOdDhm7I0oixpD1ahrmSDdUtcF/nbHyBaL0IsXPqoqwjO73RbX9drhV4Rl25TtgFQQs4VWLgwDwjCgKmvSNEGWFU6Oj1mtVoxGQx49esgf//Ef4XkCwmvbNkEoZCme/fNmaIemD4IAy7J68pTgT465unqNZdlMJzN03SSKIkzTRDc0iiJlPJ5gmCayIpwXVWvRNlSVshLR9cPhgGmanB6npJWYpyyWS7Z7nzBJqGUFSZF5/OZjXl9dochCDa8bOovFHZIk43oeErWwj0tiocqy1u3ZxvW7uHJX9GsasetRVZX9ditamS3eXzRNc4JDwMEXDUpdE96H7XbX3zh0bcu6rvsAX2/aapOOIkRV9h4RTdN67sYve30pFgVFVTm7uBDKK0Wlbn9vs9uhKErr2atQNaMvl3QR5y7L0J3BB4MBhmKyXC3IqrwlNJ0KItNigzG0sGyTqqmI8pTGnGGPhjx48gaSqvLxhx+xXW1QZJlHTx7gmjZRkDA4OsF1Pc5OTzBMA9MyOAQHbm5vuL6+Enz/vIC64eH9B4KiYxicn1/w6tVLdtutANIGQhleNRDFMev1Gtux2+q2TFrkYorf8hrOZkci1r0Xg6u75ZKoRWuhCBSYLMskWYpp2CwXK46PTynzlMvLS54/fY6maYzGI9arNWeXp5wcHfPi+SuiMMD1BqJEFSc0Tc3AdvH9DYal8/nnIhFq2zavr14gy01vNjo/v0TXxR161f4dDF14HJI4xbYdijzD1HWkpiFLYjRNYTIZcXF+xmeffoIiS2iacEDUVdnPAqr2Ae6m793C6jgORVEQxzGnp6e9N6Fq38iGkZNnJbblsNmuGI48xuMBSZqyXKwBCV3TaZr65+E4GsoyY7fbcXV1QxAGRHHMIQhEYlCWcMdD5DhhtduQlTlSKfPpZ5/1n96WZbNeH5CaGtPUqeqm/SSHpGUwjMdjfN8nDEOm0ym3t7d9lmA8HrP3fXF92AJVu/lA94Y/OzsTGY62f6Hrem/0sm27P5o
oiiLanYrSm8o6K3tH7vr3pRnhS7IoaJrGZDKhKEvs1m3YzQ00XcexxfCmrBpBXq5E3bmLQruuK7ZJrV9x4AygkXANj0N0oKlKHj98QOiHXF3dURUNriO286Zn8uTr74jsfBQJKOtsThrFaIrK4uYObzLFO7tgdjRDVRRkSWKzW3P9+hVRdOCNNx5BI/4uUqPw6tUrmkYsDs+ePWOzWnNzdS3SaIDrOOiWjddy8va+GLKtNxsaGhFoKXUxH0kSpEaALsv2U043jN6rCbDZbNAMnbKsGI/GvHz5krff+ko7tFJa2pTB0fEcVVGYekM4v+STTz4n3O4ZjsYksRDoDEcuqaJiKCrWbMRicYvjuqLNJzcoiiBta6rKxcU5V1fXFFWJrhuUVUWRF1RlhSwJeIkkizq7rus0VcVhv+fi/Izr6ytGgwFFmbdodNFIHA5conaB7MA7HVtwNBoRhiIW/fTpUy4uzlBVjbLMcR2X3c7n/Pyc5XKJZdltkKegrn8eNTYNg7LMicuQRqqhqVE1lf1uz3uHgEZVkCQRNW8QXo74xXNUVSVJU1RNp6wrmixh4Lp4nktZSzjuAEVqUBUZRRX9GiS5F/Fut9uep9BRl3zf53A4iHxIFDEeTSiKgiCIeqJSFCXMZkcoitaGkcxfALceHR31IuUuxdhh9GVZ7mneAhAsFpwsyxgOh7/y/filWBSquibLxSe9bphoeYE3VEiThCwTyPXuqqu7Buw+Sbrwh9d6+IIgIIhDZEnh0flDqqYEGg5+wPnpKavlljiJMS2Hy0cPuP/kCYc0Qd7u2G02FFnJwB2iSSp3tws0ReX+eMr04hxZgv1uR1UUROGhjacalHmOqipEUQiNyte++lX+/Pvf57NPP2W5WqHIMm+88Qa3Nzc9DUlr3RJN06C1fXshdxXW6QIRMx4Px+SpgHUahkEYiXOu53lUtSjglFVFk5dYlkPTCGLUvk3z7fZbyqrg4f0HVFVBHIXc3d6y3fpkSYLjeJwcHaOrKq9fv8I25xjzI168fIU1EA9f2SbrtruV4FlaNmEUoSiCCSjLkmAEyjJ5VaJIMB4OmY5G5EnCZDRks9lwcnzEq9evOT055nv/7t9y794FRVlgmga3t9ccH81bUVUjehMtUKZrBna14w6VXlU1h0OAImuUZYWq6uR5SRwngIQia+R5yfXNNapqoKoab7/9NoeDz2a5RNNUFne3pGlC3Ma0y7psqVkykiyjKQqmJeY7pm2R5xllXVM0GlXdsA8CHj98iCI3XF+/hkb8Paq6QaoqirJEabs23Sd0dzVZVRXz+by/hu3iz106UZZlhsMhnTy4s0eD1AengP6olSRJz2LsSnHd97CDu4zHY66vr7/82jgAVdfZbXdohslmuxPXk4ZJWRTEqXDnubbR9w06tn5nWu6su47jIEkyum1ydXNFkaUcH82pq5owiFitdyRZxaM3HvPq+opagvlsxtgboiIxn0yRkZiOJ7x49pxvfuc3OLl/nyjPub655XDwmU4nFHmBqinM50fs91vhDhiOKLOKv/iLv2DbZsuHrR5+u91SFAXj0YiT42OyqsL3hdjFME2qJMaxbZSWP9np4uumxnJsirIkDcRQ1XVdiqqkiiPiKEKSZZpSFhzERuLo6ITNes2DBw/I0pSD79M0Na7r8PLVkuBTn4cPH5OlJaoiGnnz2ZymEY6C8/MT9oeAohEP5eb5CkVRBW26hYPWVUVViciz61gtpl4VR4iqxjQNojDgaevvnM9mfO2drxLFES9efM7xsfh6uqaxWNwSRSGyTCu/HbIPhAzHMAyCIKBpGi4uLliv1z3Q9PXrV7jugOFgTJFXLdpeYjgcURQihrzZrjg9OWO72+M4Q0F6ShJ0wySOQsIwRpIaZEmlLGtyZGS5RlFAQQCAaBpMTQy29TZDU5U1hzAkzTLRapQahp5LFB1YbcUHz3g8QlG1Hu7avXG7T/DO8CRmKg5pkqLrpthlICNJMuPxtJc
SK4qKYVg0COjOfD5H/kLpqWtITiYTVFXtG5udkapbYLuw1K98L/5/8o7/v30J21OWCwaBZdstJ0EUgY6PjhmPxxx8vw8vdXTbu7u7/n/Y932qquLBo4eUZcHi7o6h53HwfVzH4rd+8zcwLZc//8sf8fr1K5zBkP0HH/Kdv/fbrG4WGIoGDxreePiIyXTKxYOH1JJ4UKsG9quWvHvwOT8/QzcMVuslhiEip7c3N6xu7sjzjP12S12LdNp0POHu7hZZ09lutyK9pqq4A2F0KtpPjjAM0aVe6yiGeWVJ1i6KXWBqu932+XYA2zTJs4qqrDg6PWW73VDkBVdXYosuSw1B4CMrYieSt32Hy3v32O32qJqG7+8ZTyYEoc/t3a2AjBQyi8UtnucSRX7/9QaDAbvtvrdVp6lA68uygqqoVGWJaejEccT19TVNXeONXHRV4cnjR/zBH/wBSRJzfHJMXZcc9nvKMmedxKIFm2e9/UrTNC4uLqjruh+OdR8Ag8GQ46MT8rykKgFkEbJKMyaTIdvdGs8dcHN7g+N6vPPO1/j8s6coskZWpiiyysATWYDcTMnbmxNoRGeCGqlpSKKIuiWIq4oigmaqhbBeSe2cYEyaF0iyQtPkSIpCmudo7eJgGEYvd3Vdt91hyT056fT0lCIv2O12vU9y4HlEcdTSsx3ubu/I8gW24/LkyRNc1xXp2Db/0D0fXXCqO1J3acmO4dCV+X7V60uxKDRNje+HjMdT6loiDBPqWhKDr8EYy3HJ8pLT83P2223v0Yui6Be64Z1Db7lasd1tuDg9JU9TbNNAkmRkSePoaM7JyQmfPn+BWbmcnZzR1DWL2zuePHxEeAgEsdmxkQyNcL8jDgLe/8lP+No775AFe2ZHxwS7PUmWMZnPMUyLn/zkT1ks7nh0fkocxyJ+3FJ2P/v0U4qiYDKZcNSm0BTLwg+C/jgUJTFI4oF3XJfBQJCTh4MR282GGsFNNGWLqhTb0qStyh7CgAeXj9E0gzAIGY3GjIZD9rsdk9FQ3Bps1yRJwsPHj1htNvhhyGR2zGg6IUoTRtMJ73/wHo5tEqchmqYTJzXz+ZwXL5+haRqz6RTDFBLf+dERWVq0Fq+05WEoPQvAcWwkSaZpKgxDZzab8r1/9z0R761KHNtmtVyg6xpNXSFLMqal49gWyXbP5b37AP1D38V/0zRlPp+3cpSKsqyIwpiiqLEs4W1UVJXFYoVhiG370fwI3TD54IMPOT89Ex2aoxN2uw0DT8Si97sd/uFALbcrcl1TVwVNXeGYlpg9KAoSDUWRt2/+RjQmkUjSjMuLMyxTx99vKYqcppFQVSF/6cC7tm33HQWgL0ONBhNMw+K1+prVakVV1qiqjqaW7Y2BKgjSctITnuM4btWHQq/YDWZt2/4FhHtZli0rwulr01/6K8m6btBNg8FoJAZMjZjWZlnGdrvl6lqwEMu2O9CdoY6OjlitVn3Ky3EccSYrUhRVJc0yRq5HkWVstzuyqhEMx8mY4zhhsVhzdnzBRx98zGGzZWC5DJ0hzmCI4Tg8e/Y5w+mEZy9ecHl8xHw6YbfbE/p7bHeAY9lopsWzZy9wnAH/0T98h+KwE+YhVeXu7g5Zlnsu3/n5OXmec3NzgypJqO3uoEurifqxCjT9vKRuaobjEZvNhhqh2JMUmTzLyFrFeSPBi1cv+cY736Isa7F9N4QlKs8z8rzh8vKSzWaNZZm8+fZbHPyInb/n4vwS/7Dg8RuP+euf/TVJloIEQRhwfHyfm5srjuZzoKRuirbaXveg0sPhgK6KAJrnesiKwqBdqMsypyxqphMbXVUpy4IwDPohomkaeJ6Drqvkedrv+mazGa7r8vr165/ftbdp1U6E8vnnnzObzkmSFFXRGQwmrNcbQXiqSh49eoysNKiqzPPnzzFNG8u2MS2L0WjI6m6BYZhMx2OSRGRcakBtoMgzqtZOpckShq6IgF2aUBU5IKHoFrohHBuyrFCVFTc3t2iqjHA
MSTi2zWQy6ZOadesU7bR+3af3aDTC3wV9wU1YtEJ2u13PVOxizoZpkOV5D2+ta3Hks9q5Rzdv6W5qusp29z3P2uPOl36moCgyo4FLHAXYlkVdFsRhgO+LVV+VJfzdBtcwmE1n6LrGYrEgjmKitiTS1DWj0RhVVZBkiaPZnDA40LgeN3cLkCTKqkaW4OG9U26vX3M6n6BrCvsi5/TykqPzc37ru99BkiX+tz/6Q6azKaYq8+jynLnjsL65ZRscOL28xBt4oGkE/gFZkXnw8CEH32e7WFBVFS+vbjBME3c0QTFs6rrh85dXNECaVeRygm5YzE9OCcKQKIlQDNHTb+oGU9NRGiFlLcoCbzjCsW222w15Jlb7pq6ExbgQScePP36f05NT/MOGyWRM1ZSohs5oNKQoS1TT5Kd/8x7f/Z2/T56VNO3x5nDw+elP/6plV4SMRh7TicfrF8+QFZnDPkJXNCRZoy4bAW1RS1zXJokjaHKKIifNVDxvQFkVuLpCsDuQpQVpkbDcrahLuLm9QVEkHMfCtHR8fyfwb6nE6fE9EUWWdVZ3C2zDpi5otYAjirJku99zOPjMTy/YLJc0VY3jaiiGwaO33uLp558zcIas/R1SU6AqDW9/5U1uFiuOjo8Jg4D94cByvSZNE8q8oKlr7t0TiVM/iUjTBEWSsG2LIs8JA584ilAdDaO1cNUoWKaJoshUZUHe+il0y2DkOS0prKEsa6qyQTIUJGTR08hzirwiL3Ic26MsalbLJZvNGoB7l/cwLWHBrioFSW7QNIW6LsnzBMsyUeSGPIup6wZD18nzjOFwgCIr3N7ccnR81AqEZBzHoWlq1usVWZ4Rx9385Ze/fl0V/X8P/FfAqv1j7vZLiAAAIABJREFU/13TNP+q/Xd/axW9Isu4toWpi2GJpkhYhsbg/AxJkvpV8eWLFz1IRVVU6qpiNBxR1zW+72MctWCJQ4CqaSRRyvPgFaoqSlW+7wu3ngxff/sR7334lNvbK568+Rbz+SlPnrxJlOe8/9EH1FWFXBRUhz1H8zlyJSb9dV0TxBH+9SsuHzxksbhlPJny7PNnQrxi6DjugN/5+/8B251PWVXc3S1I84yjRuLFy1fIhk1NQy3JKIZJHgRoto1TV+Rxiq6ITyhLNyglFcPSWSwWRHGKqRtkqbirNnWLLE2xLQtJEjXhINoLyk8WCYBK6HF+71Lc7U+nBGHGX/34b1oOoNJ++ppUdYWiyNR1w+vX18ymU6SmIgkjcS3YWqMDXxzZFFmcrR3HIokLDENHlgWRKI5jdv5WhKl0kyzJUEuVgT2EpmqzJQFVrdNQcwgPTEcneO6YKMzYbK8YjT1cdyiu4qKEvIqYzKbEcUojKRSVCOtMh0P8KOFutSC5uaEG8ihgPh3iGCpVFpOVKbKqcHN326ZBD4wmY+LIIE9SwVo4xNR1hUqD2346V0VJFEaEYUIQRMK+pehkZYOlS+htu7cqc1zHYT6dirlLUyFLwqSlyBq+HwAyhm6hKjqGKzoJelmiKOLfV1XFZrNqQ1y+wLPbBpZloukKg8EAgDA8IMsCl+95NmUbv3dsE5CQPQ/bsgTu/3DoqVrT6ZjlckGShC3t6u92Jfn7/F9V9AC/1zTN//DF3/h1VfRd61FV1b533vXEO2pNN0TsGmPz+byXdQ4GA54/f85yuURVVVarFY7r9mCKbrtmmmaPt3rw4AGbfcTr6zXbzYbJZEYQHEgjcU6bjQZMj+asFrdEQUBRlCR5Dpq4Frp3eYm/27FeLHn/Z++Sxik0MHx8nySJcByb4cAljGKapub+5T10TQR/dvs9A2/AaDLm+auXyLJMTYWqqKRNjaIaJHFCmmSYgyFvP/kaRV6wXq8o8wzbFhIZRZIw2uqs43g9uCOKor780sFqumTbb/zGt/iLP/8BWZpQKAqLu7t2Gl3w8MEDPv3kU1JZXMeen56IRGFVUhQ5ZV21CDwxQ2hqEaAQ3AK5h950dWtD15EVhSA
MkTKJYBOgKgrUFXVTopgmiioLRR01y+0KXbWQVGgokZUKVVMIgqXQ+AVLAn+PrMocmhC5ani9W4KqU0oqqDq6ZbeLW0VeNtwuFshLiTgpOD87w5tMuH95weJuIY48rc7NMAzqumLkekwmk95Q1lGMFEXpW7yj0Yi6EM+TWDB1Bu1/p65rdFXFtgUhaTgYohtab/Lq6tKTifCEdgj2OA4oy7ynktvt0aNzPnYzgKZpCKOIPCvaI0GNZYnejyTJ2JbL2dkZYRSx3W5JkhhFkYljg/FkRJYLDkiXc/hlr19XRf+rXr+Wir4bggiZatwvELqu98MTVVU5Ojrqk1qLxYL5fN7LT7pzU10LJfsbb74pGAyyzG/85m+y3WxYLpc9tSkMQ8ajIS9f3WKZOmF4IIkjPnt1xcgbiHP1wRfKessErWR52GOqDvPTE3x/z37vc9hvabKMSXdNV+YoikqaxBimBU3FaOBxdfWa2WzG3/ut32K9WaMbKreLOySEYOTV9TWKomCYpui61w1REFJFMc+fP+fNN99CkcVRwdBVlNEQ6pooDAW5V5F7ZVjnG+wI2ZqmCXS6aXJ5dsmkVZSpksRkOBTmYlUjPkS8/eRNXr++4vrmWmj1vAEgRClpFmMYeg+LifZbFFXFNkXfP04S4kgwBDRVQ9N0EXsuSlRdI4siBq5L1TRYLSA1STPOLi756OOnZHlBUVXYtommKSyXt4RhQFEISEsaBih1gVor1LEoY40HHnerDef3H1ErKqUkY1oOw+GAosg5PjsTjIS0II4jmroiT0XcfTgaMp/NOOx9PNdjs1lTVUrvzfC+8EbvgmRdt0OTjR6DZraBuqQsxdeoSgaDAcPhkDRLaRBAYdd1++p0F8nvas6ObTMcDvuFvZsjdNfv+/1epBazhCLPUFWtlcgE7XtI7UN9WSYSmh1Ve7NZc3NzxXAknn1V1Tpcxy99/V1mCv+NJEn/BPgr4L9tmmbH30JF/8VXXTd90KLLuNvtN6kDqmRZhuu5/Y1D0E7uj46OyPOc0WjEdDoVKu/tVhh1bBtNUVDaWrDYom1IkoTpdMrlvYecnd/nL//yR9QNbNZbLs4uGAw97t+/z83rlxxPpuwPB9KqRNY1jk5OMCyT1WpJkecYqozmuszH4tPFMg0GwyHXNzdkWc5qsyVOUrK8IEtj8rzgxcuXzOcCD//t3/5tNvs97sBlsVyyuFtSGAUD22U8NigQ5/zNZoNtWZSa1g4RBdXIbGG16/W6D/aoqtqeIwVV+fr6WmxXdZ3paMLpyTEH/8DzF8+xLYvBYMjrV68xTZPgEGBoOm88fkKUxey2O3RdUJOahtbuDXVZILVkJbkdotVV1SYjK2EIlyQkRUZTVTRVQzZNNEVGlXQs3aCpRH/l+uqavMiwXBvfP7BcrNFkSWjayhzb0qHMqctSpDtLSdS3ZYmkLjk/npDFe47OLgnTnLrMeP+9d3n05A1QVJq6QWoaZpMp282GQoKD7wtwTVukK6oSy7Gp66InJhstoh5EQe+LH1qG9nOLk9Fq3pQ2RhxEIsqu6zpRpDIYeD2Svfu5BIGQzRRFwWq1QtcV0dloBcFfvGYE+p+lYWjMz8/5/Onn7VxABhocxxbK+zzDqixkWULTVOI4bLmORt/7EAAj61e+H3/dReF/Bv45In37z4H/Efgv+Vuo6CVJ+mfAPwM4ORKWJdM0xX1tC4ugvV6ZTCasViuRMDMMtDZA0qUZO5CJpmmUZYlpmmxWK1zXZblYsFws+jv20Wgk7tl3O1zXY7lc0zQFn3z0AQ8ePeHi8oL5dMpiucQ/BBiqRlmXVDTM5nNUXaPIUjHciROyKGLsDrg4PUFTFV5u7virH/+QLM2om4bp7IgizwizmKa02G/X+NsNr19+ju2K+vX87ITvfve7ZHnObuvz3s/eZb/doasaqqLhDobc3i4w2uu7pqqwbQvHcdvvm4HjuGia2icfl8tlLwrpev3r9Zr33nuXJw/fYDQ
ewoumnXBbWJYl5DLDEZqq4nlDji/OeS9+l+BwwLJ0TMvg9uaKqszYb1NkRabIchRFGL5F4KoiLRP8lvFomBZV22Z1FZWmqjEsE1XR2O8OZEXO1t9j2iZllZNlMXlaUkmiJOe1zkpNlXHcIU0NTVljmTa2rYNWY7sukqqT1wUD28CPEt566wlRmmG7LtvVGqOqudtvRQU8z5FkGZqGvEjF7szUQYZwH/Zat07E2m3zPc/ruxeqRM8myIui1xNK0CvaRqMRhiGq/oeDSMDWdd0Cg6Q2qCWkQVVZ9cG8ru/ReRzOz8/ZbrfCEyqDqim9a1JRtL4kJsviVsp1XepGHNHqukQ3xHF8MpmIlOx4jPx3OT78slfTNIsvvLn/F+Bftv/4/1hF3zTNvwD+BcDbTx413f1q1/TKc1GOMQwDpWXZ0cjYts1quRTXem1CqwNT/OxnP8NqZR+T6bQvlLiu2+PMuwDJaDRC03TG4wGnJ0dc39xhWwbvvf8uQ2/IaUtzilNxRTeaTpAkmaE3YHF3R9EOPseuyxtvPKJIYoJ9wmp5h9PWfKMoJokD8QA1NR9/+D41YBkaaSITRyFxnnG3XvLs5UsePnpCWZT843/6T/ng3ff56IOPCOIY1/VEajMS50FN1YjjBKkFsRZFQZomFIXSo8S72Uw3l+nup29ub7ANkzffeovpbIosK+z2OwbDEXlZs1ytuH//AftDQJCnfP0b3+SHP/hLkjRlMh7w4P59Nuslm/UCuWjQdI0kETJfXdOEih2o8hzNMFAVpb9CNi0T27RI0oyktYWrus54MCCMxZth5JpM3QkDb4ihiyCQaxuoskRVVEi1gqYYOJaHotc4A40GSYS8yhJ7MOLZqyuSIsfWDQ57H1MzCA8rDge/vdoTzcy8dSLUdcWLVy9wLBurRal1JPH+jZ/nvZdEURTKPOvnXY5tkqUpYTvY01W5b/Na9hhNE/Sqqqp68O+HH37Yh5K6r9NdW3Zyl+5nd3p62vshkyRGU7V+N23bHWo+JQgOuM4A0xSFwKLMsGwDkPrehW3Z5FnJwPx/mdEoSdJp0zQduuU/B95vf/1rqeibukaSZWRFYbVasVwumc+FQKNqz//73Q7TFKuvoij98HE4HPbHi/Pzc4EGa/v1WZYxaek9w+FQHC1Ms6cKh0HAwLM5Pzthu9sThQFxkjHyhvzVT3/C20+ekJUFnuOw32yxHBupQZzlg4CvfO1tpp7Hp+9/yGwsYqsP71+y2+1Is5zXccx6tRRbwbzANDRhjFqvkVVZLA5Z2jb9Kj7//HP2/oHrmzt+5zvf5T/+T/5TwjDiz//8e0wmU3RVFWfiLOVw8NlttmjtgykeOLN/oLtPug78Wbc9icHAY7lbYd06KKbGbrunrBqyuuHtt7+K9Plzbldrzi8uWG5XlFXNZDolTWLCIMJzdDzXJglN8ixBU2QqTUNCEjdCekNRVj3JKU4yVEXFdVwaqaHRFMqkwVA15KohiRImkyHnjx9jW5r4vpjiZ9xQUOQJcSyqwxVC6Go5Q8bzIxzPAKnAMA026zWSlJJHMSfTKS9vFsRpCFXD3t/iL8XnmKrIHA57ZEXGtCziJKRsqci6bWBpRr+gdjOqjqL8c1Cq6DOoqkoYhsRh0G7XxU61qsrWM5m2eDW7l+F0lXrPEwv9ZrNpE7kScSyQAL7vM5lMqOsaty2jzU5OuHrxAse1Remq7YbM50e8evWaohDimK7bIAakGrqutSGvks1mw2q5xnW9fhbxay0Kv0JF/x9KkvQtxNHgBfBfw6+voq/rmrvbW+Io6s9wWZZxe3uLpuu9xSjPCkEjCsWZrdsad3bdriVm2zaSLLZnZVmyXq9/oS/heZ7YFtKw3ayZTkacnZzw/R/8mOPTC4oyFw6G/R6amigImXhDNGSiQ4i/2zMaDQmDA9vFLbPjGXLTkKQxa38PsowMGLrKZDggjWPyNMWybWpdFQSdKCTOMqpWoGIicXQ6EyWbnc8f/8t
/xVe/8jW+89vf5p/84/+Cf/Nvvsd77/4No+GQ87MzfvrTnwhEW7uQ6rrSQzt0XW8HSmo/IBPMCoUkTZA1iSCJGI3GnN2/4NXLa5IiR9Y0Lh48YLFYUtQ13mBIWZXohoVh6LzYLEljn+lkyGw64XDYkaUZtil2Z5IsUxRla/uuaeqaMi9Q21JOI9fUErgjQcR23QEXZ+ecHE05PpqgKDW77QrZqBlMXB4+epuHD84JwgNBFJFEJZ99+poP3vuc7bOA2eyY05Mj9EZn4I0JA5/JbMpivWbieexfXRPEKUkUYbU3T9SNcEs2AqYiKwqD4RBV11iulgSS2p/tu+eqc4p0zseiKGhKcYzYbrdQC2Cw2gmAZTEDEPYr0QPpmIvdTKELL3VXjYauEIbijer7fh9g6o59mq6LdqquYlkGpV8yHA7JspzhcEgYRq1KUZild/sYx7GwLLvnd1RVRRwlVJUQC/3ai8KvUNH/r/+eP/+3VtGrqtY7FjoRTBiGpGnKi1evKPOcOIpwbWFbUhSF/X7PyckJ7777rkg7tljx6XSKf/AZjUbcv3+/n8ROp1NevHjBarVis9kwGo04mk8FBstzGU9GDIdDFoslo+GYyWiMIsl8/PHHDB2XapJRFgJ6YQ9drl6/xvMcyiJns9ugSgKKUbXVVd0wcF0bx3F6uEVwECKYsqp676IkQZEXZKWPpOjcf/CINM0IDhFJkvKv//X/zvHxEf/oH/1nnJ4c896777Lf+6IaLNG35WRF6PG6XUHnFHRdgTTryMuj6QhZU7lbL1BNAwuJydGcPCu5ur3l7OyCew8fstvtePLkDX7y4x8RZxkD12I2n6NQ0pRiJyZLElVVIEkCwy9LErIkt9ekhvhUtywUTSxOURmhOxYDe0hd1BxNj3BNg4f3zinzmMBf85vf/Ab3v3bByfkUVZfI85hJpVLWE/JcZnw0QTI0nj27QXdMVN0lzSsc00EC0jihTHOSIGTkusRRiqFpSHXDeDwijCJIhO/SskzKpiZOItz/g7k3idUsvc/7fu+Zp2/+7lj31tRdxWZTlNWiLNoWiRgJEGeVrBJkGSD7bLKws+My2yBRgMhxYCcIkAC2EcSLWIgFqU2KUmJRpkj2UD1V1a2qO3zzcObhfbN4z3fUckgBliGjD9BAo3Cr6tZ3z3nPf3ie32P1iKKQbL3vtgpf7u01OWqKZVmMRyPyNNErx8EAoSRSNuQtD8Gx7M6ZCJDnOev1mg8++IDxeMxwOOy2Caenp22rLFsLedUlnWnXbMDt7W03cA9DH8Ogg/XojYnUGx/b0QPwqub27g2n96aUZcV+v2c0GmOZFpZls93u/txDwfze9773r/P8/qVc//1/999+7z/+D/8DiqIgz4vupg7DiKptBaqywrJsDddotfDr1apNe9K7W8/3KFuZ6GGqOxgMWCwWbFqS0QGjrZRitVqy3W6J+n3qWjGfL5gv5lxf3zAejSirErs1WiXbHYPhAMu2qGXNPt5hmkZ7EDRsd1tN7J2v9UlcVgyHY/r9AZJWc+HaKJQu7wwbxw9wvADPDzAsDSjRe3/o93qsV8vukPn+D74PAn7jN/4GeZZTlgVpkuL5HkoJdrsNo9GgzWSoaJqauqoZ9CKUbMizjKbWAbEaeGuTxCkCk+nRCQgDw7TwgoAgjMjyDKEadtstYRDgtZkGTaOBunmZY1o2tVJUlY5pN00L0Vp0bctujVGuRo2VJYarpd09v8fDy0eMegN6gcfi7g2jocff/Hd/na//1a/R6yksq8KyBZZnt4NlA8MEP/SYHA9wAsV6uWY9W+Magmy3o0xTlFSUTY1wLHZxguO5ZGmitQVKcXt3qw1X7ZQfwHEdXEeX/skuJs1S8pZRkLSkoqA1pzm2TaP0Gtl1XcbjEb6vq5CmrjFNo+NDFEWJlLqiEO1LY71ed6yQQ9TbbDYjS/UqUke8ed2soa4bDNPCEAZNoxiNR9q5KQTzxYI00Tkjq/W6NbwFrNc
rpGxIkj1xnKCkDlY+Pj5pw5FdTk9P+c2/9z/ffO973/utf/V5/ErInJumpigqkiRlOtWsuabR9JokTjucddM0pHlGGIUcnxx32DPTNLm5uSEIw84oUtc60++Q9fjkyZNOPx5FkebZZTmL1YpdnGDbDvdOj/j02Ycg4ZNPPqTXHzIcDsnrEtNUYBkYtsHPfvoTGhru3TvX1GR0IGt/OGL2ZsN4NKA36JEXBR9/+hFSKNKi4IuXL3A9j3DYoyBA2trdOQhDjhxtNFovVzqheNBH9UPSquLV9RWDwYAPn33IfDnj23/11/nmr/wyf/wv/ojZbAZmwluTEU1TsNts8NvV3zpfEe82+I5DYeq3Xp1mrG4WuJ5PEPVQtSTe78nyksdvf42b21uiPMe2Bc8//4iLi/uEYQ/Hcnhh2Hz22acIt0eWVfiuRWPV2EJgOq6eCylB4IeAhLTBMQAkyoAKhW8IIssmECauVDw6P2f0yw948GSE2ysRxkuEYSNwaXKBYQYYIsC3wXclUZgT9huwe8hsxU/e/5TZ+jVv3XtMWQsaW7DJCxZJzCrbQVPjRz5FDdt9Rtgb4ssGw4CiyAkCHwtY3N6S5wWG0Ig/19eqxrKusBwb1YbQlk1N2dQ0bZhwUeZUTY3ruTSN9kfs4gQDRS/qUdUGhml0epuDaOmQ+6iUnpNYtkOvr4VMZVmy3SXkedVpTtaZdojmZcHx8ZSirBi2a/DFYsFoNMIPPObLO/IyQxiCLKk6MI9sFAI4PppqSvif09V/JQ6FAwjUcRwc1+Xi/n1WiwVCCM7OzrrYuEePHunQmDb44oCiOoictLIvRCrdNx7Wj+fn59R1zfn5Oa9fv9aDx+GQ9Voj0nTfqBHe9+7d482NHkodgLCu5XDvUrv2vvjiC+1TL9JOLur5LqZlslqtGEwmpEVBvqxZrJZ88fIlZVNRy4YkK7mZ6wg5r9XHH7QZh7Kx1+t1RJ7JZILf+u2rqiKJY4RSLBYL4t2eb33rWywXSz7+6CPWqzmBH1AVGhNvWwZ57rPbrKg8r9tEKCnb9GEL0YpbBr0ejqMBLEkcY5gmv/IrfwXqjF0SgzAZno8Jo4i33npMEu8IA4+7u9fYhpabQ/uWc6xWbGZTt/OhQ3y7EAaOsIh8F6Uyykpx+eg9Hnz9AcLKqIoleZ5h+ZYOay0lsi4wPRswkbWgqE0kA3rRW0yPDM4f5Fx/9hpMAyVpWzPJ/O6OyWRMGcdEQZ/h5FRTjtZLPv/8M5pGw3cWiwW2bVJWWq3oe/afiXs/fG6maXZyeiEEPd9rw1lqhFIkSYpsJHXdAAKJJmo1sgKiDpDitT+LA1NxPB6jlOLhw4eMRiPSNGW73XbiPL1xyLpQ5dV6yXCo0WuWZTEcDjsilVIafHx5ecnz589bcdOuy5c8vETLsmSxWPzC5/ErcSgcVjEPHjzg9vaWi/v3ES2jcLfb0W+5A/v9HsuyePTWW7x5pRWC6/Wabcu3O7gQszynbsU6Bzln2bIaNpsNy+WSR48e8fjRI8ajIZ4fcHX1CtlIHMfFsdsdda+vdeVeQNTvsdnvcAOfvhxixAZxvNcoNMeirip8P8QWEXlekBU5z69eMj2akq6WJEVFNBhheQEK6A2PMAzN00uSpCsvq6ri8ePHxHHMdrvFdmwePnjAdruld3FBU9fc3NyQ7PZ8+uwZf/2v/XW+853v8P3v/y55O73e7XYcTSf6MKwKPfBrGZaB72Pa2tgTRRH9KKIsSzabDRLBeDwiTvTWZLnboBoFhom/XYMhKIqS4WBAU+bMpcI0DHCtjklpmgYG+q2kS96mA5YGrmYqRKHLbnvLN7/5DtPLIXm6JMkT0jJjE+94+PgCOwgQjkQ4CmhQjURYDoaKoDFRjYPjx0TjW3bFp9ws5oThAMMJmU4mTNYrDAVJVqBcyatXr3Q2Y6D39JvNmjTVwSmWbYLQ0W4C0W2smqbptlhv3rzBcZwuj8FE8eb1GwaDnpblA3lZasx
/69QU7UFykNh/eU18SJxerVaYpsnd3R3r9br7GqADpehZgp5Nua7d2QEOq/jNZkOv19MbuIsLitZB7DluZxc42KbfvHnT4Q5/0fWVOBTKSu+Ar6+vuby8hHZXaxhG9+Ed1kF1rbMJez0dorJq1YtCiO50/PIm4jDsGQwGvHjxolsjvX79mqdPn+D7Pot2h3x+75y72YzPv3hBmhVUtcS2HU6PT7i5uyMIfO7ubokibff1PZ8w0Fbc/X5HZZSs4iWz+ZzNbst6u+XV3UyTkQxBr9+nlIJdHCOsgMlkwvX1tYa3thxKjRnTFtd33nmHxWaN5dj0+xG9MOKzzz5jdnuDkpJd0/DDH/6Arz19ytuPH/PDP/j9js+wWCw4Pz/DMk2SPCfPMlxHQ0td26KpK+K9xroJ02TQ72kZ+GCIIQxW6zVP3/06q8Wak+kJf/KjH3MymfKNr79Lut/gGoL5m1eUVY7daj8E2vEqUAgUjusgm1o/CHWNqSyMBtbLGZf3J3z7O9/EjyRSCmwzZLmIef8HP+Xfc0554vWwzArTqrEsARStOtJHFjpW3YtGmKGHP+iRlDmSthU7OuLzF8+Z392R7fYUSUFlaVJRXZcd+k6TnRwaWbd0cAfLdLvtzUE/sF6vO1rSIaVpPl/Q6w9ACMqywvU8/RkIqMuiRQlq38hBbn5Yneuv0y+B3W6H7/vs9/sOxWbbNqPRiDAMCQK9PThEKAaB12kYDoPOw/frOA7RcEjy+jWnp6c4lg7S2e81bWqxWLThwNW/mffh38bl2BpSenF52W0gDv/Q+XxO3gpdDtCSYYub2u/3nYLx8KGLVvDktD16HOt4rKdf/zppUVBXFaPRqOMa+J6rh1BS0R+Mef78Bffu3ePTz77gzfVHPHr0iOfPn3P7+k2LMW/o93pMp2OGgz6mYXRTb0OYbOMMwzaRKKJ+H4nACwKKuiYrC27nC5I0Zbve8cEHH/D48ePOC5+mqY40b1OKb29v+fCTj8mLgn6vj1JS8x4fv0VZlOy3W7brNX/ww98nCHzqqtL4ulSbsNbrNUWhb0SjbVXyTEM6EDVpElOVpU6mSjN6UYBsKo5Pzgh6IbWS9AY9ZvMZ733rV7k4PuXZz37Gfr1kcXeDUApDAW3rZ5kmtm3iWDZZluA6DnWlKzbZNBRJRTgJqcqUX/v2X2E4DahJkZaLkg7C6fHZiyVHP11wNP0G06mDYcRIuUeYgNGAKmmEQokSP3SxAg/DsalyRWBbpHmGmSSau2BotoMQFrWnQa6OqzddNzdvGI1GVFVJmRZt6+PiOPrrDtyGA7bs8IAejHllXVPudwz7fbIsp9frU5Y5VVGihIHjulR1Q1OVXaV0kFQ7jtORpXa7HdOpzs48MDXm8zlCCIqi6F4Wh/8OLechIe0QNnuIi0vbFkEIQZrqcNt+v4/fApGzLOt4pr/o+kocCqZtcXl5yW63o2o/tKqqyLJMl2pt312WJScnJ/z4j/+YXq/HyclJJ/FcrVad7NmyLIIwxLJtysUCKSU3r18jpE6a9lv9d9qulWxbT/6Pjo64vLzk6tV1u++39bR4t9c6/1oR9ULc1h2nSTywXq113FqaURQZtZSEUUicpkgFu3iPEgZ1u/t/9Pht+kHA1dUVoHvuixZxP5vNUO3cIAgCAt9vo8MyHNvGECavrq50vHsra9VT6hIpG5pavwWUFO0NrG/qAyhUCMF6tdK+EMdlu1kzHI2YzRYEvT6uH+B7rj7slCJpQ2gPmP0sy/F8nyRJcVwXwxRY7feoszkhCAOEUIR+oLX3nodj26yrBNfzOL10KPoWAAAgAElEQVQ35vTyDGHpmHhhgGE7SCwmJxfsdhZVGZEm0B/0gBzDqFAUIBS2I+n1FWkpGU8nCNPAsg1sx6aoaq5vbrqXgRAChfZYTKdTZvNbgDaNquzK+zAMsEx9GBwwZ6vVqt0GuJyfn/P8+fOu9Hdcj91+x+3dnND3iJMM2zK056MyKEtBo7R
79BA/cKhaD8DVA5b94LM4QGsO1XHWHuBpmraQ4ADDgPV63SWzB0HQxco1TUMcxzxq2R5lUXZbt4Nrttfr8fDhw45W9vOur8ShcHi7BYGm4zRtuXWg945Go84SDfD1d9/lpz/5CWdnZ5Rl2Q1ZhsMhq9WKKIo4Oj2FFg3+2WefdXvhg/Jxu90yn+mWYLPdMR6POjR7GIY8evSIz794zna7073/eMSgP+Ds9ISHD+4ziCINf9msyWL9xs3SlDAKqKqa2WLRotd90iwH00YJg7woeL7bM+z3OZpOu5IuTdMOVOr7fic8UkoxHg3ZbjaI9rMKXY9dK8gZDPosFgvSLG0fID0glU3VqfEcx6FpkfC2pfvcNEkoypIkTTEtGwyTpq5QsqYqC6ZHE9bxmtVsztnJGbJu+PGPf4xn2xjUjKYTVssKTLBbi3oURcjmT1O7BLSTde3S3G4yLM9iOB3R7/UQtsI2bJSyMQ2b45MjfuW9X2YavcPpqYeSiqausG0XZIEwJEI0WGaJ40h6oU1d6TxLUwmSNCWpKnKlkCiUgCTP6EUDGim5urrCMGg1MCmDQZ/pdMJg36csteHroPwsy5J+v/9ntlsHalGe5wyGE1xXt4/aCJaQ1BXHx0faZ5PE9PtDLAqNcrOsrqKVUhKGIWmadulhX2YqZlnWheAcqE0Ht6swIM3ibrB++PqDlLrf73eZlFWhX6JAV2lfXFxwcnbGp8+e/cLn8StxKBxw3kEQ8ObNm877EAS6lMsLbZ09lP1pkjAajbof4Pn5OVEUEUURoL3xWfuhHdyUoAGsUko+/fRT+v0+o/GYIs/avsxjudrqfIIWk1WWVadpEIaBME2ifo+Liwt8xyGL9yyrirRlJviuyzzZsFpvKIoKy4QyT5CNoshyvDBC1jW2rROZD4nPQggmk0mXiHRQbAZBgNeaoI6Pj/Bcl+dffMF+LwGF73lYpoHnOCRJzGq5IgrbN0ea4LoOptBDP6m0UxDAEFA3UlcZhslqteTpO++y3u4JfR9DKKqyJN3u2SxXHA0nvLq6wnUcfM9ju14wGI9oKJGyJonTjo+53Wi0vOu6VGWFaWoRkGwalCWJi4SkKDBcH6m0zdySJsqwGQwdnjx5wH5d8Nnz59y/OMJ1FRqQ6iCk/uZNYeAYAt912azXmMJA1g1xFmMFAbKqkIbACTyqImefpdhdqpOFaZmAxHUdlstF659JcF2PQb/XzWWGLR7w5cuXAN19cUiDBoGUSmdUSn2PrDdbyiLXsxvHxrK1NP0gSNpsNn/GgXkQl9m2zfHxcZvCte44CxpQ2+8Ok/F4SJ4nnbjJcRyGwyHr9bobis5ms272AfwZzLvneezbHMtfdBl/Cc/4v/YllQZI7HY7ja4+Pm5L4hq/bQOKsiRNU722iyKOj4+J45h79+4xPjri9OwMt+3Nnz9/zu3tLVnLrIuiiKdPn3YpOYethtEOai4vLzXoJc2I4wTHdVuenv5BGKZJWVcMR0OEYZAkKWEUUlYVqpFMj6YcTY8wDRNZl6imQskKWRfUZY5pKDzXpi4LJqMRliHY7XZdyCjoG+7gED1AZg7V0/3LS5SUzGYzHMvGNAyGg4EmAed6BalaX8Dt7a2OHW9bqT9NQqIT65imiWUaKCkpi5z57I7lYsFuq6lNUkoWd7cErsfXHr/N5fkF05HG2ud5ju26bOIdl48e8OjJ2x1SXFO2S/Jcr89k+4Y8SK9Nz2Sb7pDCQBo2UlrI2sQ0dbvi2HDvfExvmLNYfUQt50gVQ1OBskH5UHtQu4jaZbvcc/X8BbQORdu29Swpz9jGe3ZpQiUbavmn8e373b6rGJfLZcsdOMBPNGAnCALOz88xDINXr151P48wDDs7vx9GBGHI6zfXCMNkPJkQJymbjaZ7eZ5PVUsWSy2QO7gYTdMkTVPu7u7YtylUh2G5QgfoHJieh8SnL2dBHIJcptMprut2cwnHcTg+PkZKzW5o2llbkiTc3t52OauLxYL
bN286eMvPu74alYJtMzk74+blyw5E4Xo682E4GJAkCcv5nMnJCfObmy6eOwhDZrMZaZ7jOjbLxVLjsVoM+t3dXbeFKMuyIxE9ffq0rQRKLMvgi88/p9frMxoPKe/m3L+8YLVcs95s9Qda5CzXBZ98qnCcr3Nze81idqtLaaH7+jzNkLKhLrUcuqlrXNvGtByEaZMVNUVVMJ/fEvX7uKaDaRqcn5/hOi677ZYiz/E9j32smRK2ZZPECZ/vdoRBQOgHJI2kyHOWywVJvEc2EsOA0Ui3Ttpl6rUhOhaWoUGwlhBYlqmt10riOBYKUG2vu1oumByf6hWipcNYl/MlUdRjtVziOT4P7t/n9u5Gcy2yHrajV5B+ELBaLrFbMM4uyyjzjMAPUAqKQuPKJIqiLAh7vdZNKXUV6EnyWlFLA9cLefLoiPjYodczUXUGlgMSlLJBAgiqPOPFZy/YrtY4tkPaJOz3a8rtBnw97ykA2/NwbY8wCgijgLu7W+qmIk0zyrJoUWgHSXiPXi/i+PiIm5sbtm1ehp5JWEjZsFgscD2X/W5LVVeMRgPqumK3K/A8VyPhs7SLdOv5Fp5jY1omlm0ShPrnGkUhQeATRSFVXWGaBovFjH6v38bZV+RFRhSFOpKwKhkOBziei2lpAjpKw2P1QbVttRRNC8X18Fyvc18eDiXf9/Xm5ku05//f8/hv57H/86+6qrh79arTcx+CU3zfb+PW9RonS2IMy+xko2EUtrkDGfO5pirZroPn+6yWS4Ig6Dj3B9vqZDJhPp/rjAXbwDbHCFnx8UcfEPV6TMYD1HLFaBiCKnEdgzQvUYZgtZ/z2QuLps7xLIdBGCHzkixJUXWDQJDvaozKpkpL+qMBp5cXDCYT1vGO+WrNNo7ZxXs8TJK91lgcQkXTRItmVFPgWD5lmWA0Nr7rka0TDFNgGyamZ5PlFQ05mIqiyDCw8EMb13NQjYmBgyVMLRGmxrEVliWo6gIpG1zHxrN1ZLztmjRVwfHxMTgBWWNSmyGGZ9OYDoPpMdvtGks1NEKRVxV+2KMoAFVjGja+H5LlGaahME2FZZua3ryXWGZAvKtItxmu5+vwVVVhuxD2XISQOI2gLkpMIWiEwyjqY5s2UqLpTbWuBGgadtsNH/70Z/zL3/8XBNIlyVK2RUwtFHkp8SyBbzv4owllVeP7LoFvkWYbhsMeSZKRxDlJnFGXNkEwwjQtHjy4ZHJs8+LFC03KFg2GqVCq6oJbhdFgWeCaFaaqSVvJspSSptZDw+GotbqnCVLVuNEEXIUdWvh9j8DzNMm675EV21bnUFLXGY3UyPc4kQSBzWJ50w0oZ/M31HVF1bbWvV4PQ9igasZtsvfaFLx5/ZKLi8uuMjw4M9frNefn58xms44v8vOur8ShIBvJYrHo2HUHroKUEgyDyekpm/kcYVuMplM++tnP6PV6ur3w/W5NtFwutWrMdZlMJgDd0O6gSDvg4H3fpypysrRkOj3i5m6BlGDZgpOTY8bjCbVseP/997Edk0rngzCfL6nSgnF/SNofku327c1stGUmFKXeBcdJwueff457e0ulJHGeg2G0O2ydF9jr9brNypfR293u2YSiNjBsC9d1SNI9UkkaqWhKbaJxbF8zHi0DJQ1kBbav+QaObWHQYBgNqIo0ragqHSdv+RYuUEv+1BPSNDx4cJ+7uzlNkbUK0jW2beF6DuPRmDTdI+uGi0eX3NzcoFDaHFYXrNcrvLbKqauGqqyp65IGE9O3UbbCDnu8ulny1pOHVKJpobMNOCY6l7XEtGyUTKmbEst2EaqhLApMDK7fXPGHf/j7XF3d4NgRSZax3e1xXM2q7I/G9AcjlpstgRBkacZ+l5DnJb2ew3ajy/ZHjx+ipMBxXE5OTlFK8uzTH7FarWhaJmKR59iOzX4fg1JEvR5VVbJYrHAcG9lIdts9ZVUSRfpw0OpGRb83wLIkthVhmSGqcSkyxWSotQXj4Sm2bTGbzYhCkywrqas
11aAhz8t29uB3a/qyKHFsA9fWz8Z8piEt4/GYMNCpTxcXGmfSi/rMZvOOwnVoVTabDUBnkPt511fiUEDoh2Cz2bTZB/qBiPp91sslSUujafIcq20d/CBgeHTEeqZTmyxLf7hCCEzD/DOqsKLQEWK73Y7hcEjU67FcLMgTi+s315iGSxj2EUJLdUeTMVcvr3Bdi/FkwNWba/JKEXhtwk+vx2g8ZjVbUOUFrmlhuzbCMMjSTA+elCTLM8pUUq5W3H/0kLEX8OLVS4TQLRPo/fdhH36YNh/CPk5OTsjTgsViqclSroVMZJu8rduCummQdU0YuAgUQRRhGxZN3RD6AaYpqIoEqRoEBo3Ub1/DNDVEVRjIXJu6bm+uuffwbRazOybjMfEGvRI1BdevX/P4/rcxUHiuxajf79RyVVnSNBWmSburl21SlI6W01RjQVbnmKZNIQ3We8kmNgmjENtSSFIsq0GpCpoM2RRYlotsMhpVoyRYtsv11TW/+7u/y8efPIPGpSggSXPqWmLaYDsuvaiP7bjkWcFgNMI0HSxhsdluqCuptwZhq/w8mvK1p0+5enXFxx99yHa/aGGsBvv9XkN+DBvD1nmfVVmjlKCpIWk3Fp7n0u9NWsdrjs56dKlrhZAWWaKwTQhcEwOfIofAC5CNiTQslLSpS/B8C9N0qUpJnulKOQy0uW+11BXF+fkpVZmx3+87fkOW6WH5Adj7Zdv1IU3t4aNHrFcr/W9oq/BfdH0lDoWDNvuwfjl40bOWr7BcLnn7yRMQgsVywTtf/zqL+ZysVYMdcGw6mSgk3u27QM5DWEae54zHY+2pRzsFN1IQ+H1Wyy3RoMcu3lHLDM9PCUKPXj9gMIwQbyRKasZDP+q3LraI4NJHlpXGo3kBvSgibxruZnfMFnPyUguE/DBktVwRZzlRLyTNMgLf76bDB5PW6ammJx8Qc4vFgiTNsCwTz3JIsoSqqbSOo66o2u2I53kMoiGWJfA9H5T25Od5ShQFRFGIMAKqugByTENXGFYbZS+VQhk2u+2GCySvrl6SJommCx0fY1aC46Mpr65eYpgt7LUssWwLQ6gWOS5RChzHJt7vaIwKJaGpoapKpJBaOdgYfP7ZFdPpI27epJyd9whDk6YqETZ4jglCUhcxtRLaAyFNlncLnn30KT/453/Aiy9eYlkhjYQs0w+m7XgYpo1lOxRlhTJr3nn3GyAM1ssN/WBIGAxJ04S60kTw4bDP/QcXXL3+nJ/89E80Lo1eN28yDKNTCtq2zoQ0TY1bl5VBEiddv97U6O3GkUO/32c6PWIwGODaBoHvMxwNsAwDQygC3+PoaIpjW5iGwem9c5A1tm1q/YdtE13ozzhOElBwctJG0TeqzfDUocaGYZJlOS9evCSKQu0srvT3fHFx0aVrNa0g7rAKPQBif971lTgULNPqTEq0rrH5fE6apjx9+pR79+5RVxVJpkUch0j3w342jmP6/T5xHBNFEZ7vUZVVt9bRNJyG45aoexAI2ZbD2ek9rq/fAJoFcHQ0JcsTRqMhT56+zXI1424+p7qN8VwX27K5fnPDm6vXeLbLdDzm3tk50+MTDEMwDHzCfkQ46LNYrQmzHtskAaXXfOtlwWg8JvCDFpJRcHt721F4ylJDMw6KNcu1qeuKwPfxfZfdbs1uq98IQonO5LRqFJf3LnAMD8exGURD1qslGveiGQdhMMD39EqXduMj6wqEQaNM7CDARHF2PKVuJEfjCVkSMx6N9PYCRZ4kXG233Ds/oyxyDEOQZgm2bRInMXG8J89TUGAKE0PYukpRYNaS/W7D1tnw7E8+4dG9J6hE4Qc+6+2KQu6oA4UhSs3QSHfYts9HH3zCP/+93+fFi9dUhcS2POK0QcoGpcD1AgwpsR2PXm+IH/WI04yqapA0RFGfm9cz4niP77vkudaqvPX2I370o/+X6+tX9Achd7NronDA0dExWabfxofQlDwviCKtAhQYeG5AU+uN0enpKWdnZ1x
cXJBlGV988QWr1Uo/uEaNYUK0DFFNjWxqTo6nLFc3TEZDRqMBQdTDMC2OT85oANFuiwBm8zlpkuB5Xgtm0YYrENgtJeog9hPC1PDWVgnZ6w86ifWB/dhIideaCn/h8/iX8Iz/a19SyQ57fZD5DgaD7s0ppdRtga0dea9fveqw74cU5Pl8znQ6bfurkP1ux3q97pRiZ2dnpFmG2a6mdEKyT5EmBIGP47ucnh2z2qwYjUd6pZll3H/wgC9evGa11i7L5WJJGidMxmN816OoauarFZ4fYNsWdlNzenrC9PiYL1685MXVFelsRi0lZV5yfHqCrGpme41pS9OU8XjcGbU8zyPLsq4sNG2BYRrkaUyeZaxXc1RLNXIsiygMCQMX1wrI9hmqkhBo843vBaA0LbssTExTo9fDUOgpdVVSKYkF1IWOkFdNRZHGPHj4iNPTc16+eMHs7pbpZELo+9y8XjIc9MlzDY1xHQvPd9ltN61/f09VFuRZiiEs+r0hZVnguz6eNNjvS/Z3a56tP+Tlh1qj/+//rX+HDz74Q7LkDt9XZEXK9e0tWVqw36fsdxlg47khruOS5xVVpVsN2wGp4Oj4lKg34PT8grA/5ONnnyKVIE5Skn2GbdoURYkQiocPHzE9GvHjH/9LkiShrisWC+14DfxhW6llDIfjzjEZhjZhGHF5ecl3v/td+sGIu7sZhmHw+eef80//6W935Xue53pmVVUI9riexenJCcNBhOvaSLknzXp43iV38+dEUcDZ6WMc2yUIQ9JE6xBs1+Xi8hKhFLP5vFOk9vtTRqNxd48c4EKvXr3Gti2iqMft7R273a7bRARB0A3yr5fLrr3+eddX41BoZLcuPFiobdvuYsc9T69WHM/t5Jy3t7e8++67Hd7qIIcejsdk+7hzhd3d3XUpU4ZhcN5yHMuyZDKasl6siHohJ+dnKCSDUZ8k1e3IdHqs+fqTY+paJ0vPZnOEVFRlzXg4xPc8Ni3J5vzeOSe9iLwsOL93D0wT07YxLIv1ZsN8sWIxm+P7HhKjS+85GFbOz885Pj4mSRLm87k2rtgO6W7ParVENjV5mjAeDuiHPY6mEwZ9PU+RtY1jhwShD0KRpCnpNiYvSjw3QvgOdS0Iwx5KGSTJnqZdR6qyRNYVVZGhmopkt2G9nLPb7ijynKoqqYqCN6sFVbsea+qqVUC2aPcsJd7vyPMMgdJ/rtSsBlkrelGfo/6UMm5I1lukD0YtefNizz/5RzNCH2yr4sVHL1lsd+RVxXAwwhQRtmEihI1sbPKspigaLNtpo9Ry+lGfh4/eoqwaVpstNSa//Cvv8eb6BhlnvPXWE15/ccV4POboeIJtG3z44Yc6TbwpOjVsHCegNDg4inrtvMDn9PSUp0+f8uTJE1zX5fvf/z6r2YbFYsl6vSJNM+q64gAzNwzRZXBImWu9ggVNkzEZ90kz8EODvNhRVjH7OGeUH9M0FevVAtMySXYalyelz3q15vTshGI8wLBMimTftsGCMDQxjIz1eotl6YNrtVp3z9bBYn19fc3Td96haVeThzb6511fiUNBKdXJcS1b50SeXVywXq87x2Oe58T7GIXi6OQEKWXn+BqPx2x3O1zH4fb6mqosCYOws8De3d3h+z5FUXD18mUHbNnvt9i2wLINHM/G9n2S7Zqw1+PRW19jt9+x3cacHJ/TyA2fPHvGdDIlS3QKcFFVNFIhm5rXtzesthviVLcypm0xmZ5wnBfsWhm2EBaNbHjx4gWipRR5ntcJVO7u7ro0qwOAI42lZjyWOYMo4O3LJ/qNYxtYQhL60O/79PsTbNenqkvysqA/DDk963Fzu2C12mPUOia+KEEYBxScnt/IpiEMfYRpIOsSYSpeXb3k/sO3eHD/Pq9fXfHy5QsC32M8HiEELOYzNqsVrutQVWWbqLSmKDXMVQCWqVuHvMiwTAtHmIxGY8p6SZyu8CILNwjZ7G7p9c6IwiELY07oCWy7QTUmCvDdgLJsUEL
7KgyzII4TirrBD2yUEnhuwOWDM7b7mM++eMHN3YLVZsvx8Smj8QjXMHEcm6LI+NkHP2W9XrdhxTZRNCDLE8qyxrUlhmdyfDTm3sU93nvvPSzTYr1Z8w//4T9idjfTzlJJi2AvGY36NI1kPp91G6Q0jTEMUzs9DUjTAs8pqPsKKQVl2RDvEwxDW7VBcXt33b0YNXJwRVXpqmO9mneiKNOwNIl8GFHVFUnyGtt2QImWVG2097s2RB3gr2kcd/j5P+/6ShwK+mSOKIqC3XZLnmWsWqfYQXlVliVhLyIvS9I4ZjQada61oN9HCYGsa81juLikyDJevnxJr9djMpkwmUy4urrqYueEEGx2W5pKrwJ/8ic/4ujohOVqhWlp0ZFp2wz6Yx4+eMxq/SFJkmIYZlcaZoWOihsNhx2/fxfHmLbF9e0tn3z6BWEYsl6vdUS6abPZ7hgNhuStr6EsS6Io+jPAlUPST57nmFIy7EeMzs/xPZPj6ZC6TDiZRDx9+pivP32bk9Mj3J6PaZv6JklTGgWNstiuMz755IrPPr9mtdy3mnmt/DMEyKrAaYNNLMtis1rhRz12cYptebiWxWg0Yj674+OPP+Lxw4eMR0NtjXYcikIrTfMio6x0FSGU2fEabctCKY07T/OG09MT+sdDlldfsM4k0dEFMi959uIZnu1jCb0R8RwTqSRVCy2xHcjKjDTW8uLe0CUIRwhhcHp2jkLw8uUVwrLp94cUVU1VKyzbZh/vuTg7Zj6f8Ud/9C9QSieKDfpDpGrY7fYIAb4X8fDhIx48fMBbj99iNBrx27/9213mgm73MpbLFb0g5PT0hCgKu0rDtk12uz1NU2NZLpZlIisT0xAI5dHUNk1lI/Bw7R6WGeK6Fr43BAS7/YrhcMTJ+Tm2afLm9WsUNXkes1guOT4+RqAoi5okSajKujWr+fq+zgrivQ4DWi1XuJ7VQYoOPpiDMeor3z5UtU7lqZuG3W7HyckJtm0xbMMzNOyi6uYN2+2WKIro9fuEUcRqNuP08pLdcsnDhw8pywqn/UenaarZ/XXNxYP7yLph38pCsyzBNCCPM2oJd3e3vLm5JYr6jEZTHCUYj6Y4dshml1MWBYvlktGgz5/85KcotPCqqErGrqO1EapmuVq1U2ODTz75RB9kaUpdyQ5uIS2rg8ccNBSHVKCDxNl1XUaDIafHR9gWeK7B0XTMd3/jbzGIXKLAxvcsDFVgqAbX93FNEy90kZiYTsDF5SkXl/f49b8m+elPP+F3/tn7SNlgOA6ykC2/z2a3jxn6Abv9jkYpwOSTTz5mMOi1QhctYV6u5qhGC4KEENze3rDfb4j3sWZD1jUVkqZqsIyKIBC4jo+UDUVZsc0yBpOIMA5Iyz2rZI5tuShbEBc5vhthGwLH1kzC+kvIMqkKpNLsDcuxCSOP0fCYLC+oqprRaEJS5AwnQzbbHRKDfisJfvXqJe+//3skaULRYvWFENRlQ7835PLygr/xG7/B8fEJz5494wc/+EE35zk8THqjo3M0EZLtbgMoPM9nPBkxHA0Yj0fs9zFfPP9CVwxbnbcha0WeKcoC8qwhSxvWRkIQuPhehm2XmvYtNRhXYjOdTrAsk5cvX7ayZrur8KIoom7qrqI8tMuHLVy/36duNLT4QPMqioLA93X19VVfSSogznL6/R6O67JYakCI41iYQsdfVT2fRgq22x3ekfZGVHlBWRQgFfFqTS+MWK/X2Eoxv7nh4v4lru/TKIltGQjLZLackxcFjaoIBiPG42NW67UWTxUlFw8edzJQpRQISa/vcXE6RDQP+PjjnJ988DP6wx4PJo/Ii4rX1zfM45RMGUSRz3IX09RadnrQr1eGQVJnDE6mCMOgrCt6gz6ObWNbNrfXN+z3uy4tyPd9RoM+l0fnjMcD0mKN7dZ889feon9iYRoVjWiIc4ktLJRjI0oTZ3RC2VSkcUKvKbCMhOHIonfiYkwvsad/jR/+Xz8kT1LAQCibqkoxRU2
WrijSingXY1oe41HIfvEKazqlSjcsZ29YL+8YjcdMplOkUqw3a6pkh6pqVA2yEuSl1lJUokGqnEbqTIS0TigXJbZ/SeiOaXKDamfhRz5lWRF6Lv1ehKANKmlqVBxTVTWl1OQlSzSEUcTF5SVV1WDYPr4ZIIXLbLED02Cxek0tK0zb5PHbl3zwJz9hdX1HkujUscl4ijYzNZwch3z3u9/lyZMn/N7v/R7v/873Wa1W9PsRsmqwDAGqRqia4cCjCjRZScqGOFniuh5NmvOTn/5Rh4Evy5LvfOc7fOMb3+CzT161mSUuu92GLI3JqxqFwXqb4Pkhm13GdDwhzyBLS3pRjW3D8fERq9UCzw3J85LZbKGDYsIQw/VY3FzTj3oIy2afbNgnGybTKU1Ts9lsuip4t9XkbcM08H1Ne9LAuJ9/fSUOBSkVwrBwbI8o6rHZ1BR5QZHlHB0fsW5XdQ2GRpm1KsC6rknTVIfFjkYdP89qWf5FUSIVLXzToaz1umaz3XLv3j32cUYURZRVxW6rNQ91XWO1vP9er8diPmc8HnN2ckoYhNR1w+vrW15f3/Dy6jVF1eAFIWmWtyTqPlWlV6DFl4AaYRThB4HeD7e06jiOkU1DXdW4rW3abEVFURThOS5e0CracPnWt99jOB6RZCmmIXENkzovMZXJrnDwc5uJYRCnBk3lYSsIPRshHIQy6UU+v/TN94hnFe//zg+pc4XrBuRFBu2gt8gLytrCMBq2hn6wlleeBT0AACAASURBVOsVy+Vaa/DrlH0cczefa9FVVdH3HZQAYRqoxkDWNUVVIRA0KG3LNAws26Gqaw238X2qsiYjx1CCqiyRdYNjOwShT1VXNEoxGE/Y7fas7mYcH5+AEBiWxf2Hj1CNotcbsN0lZK0Ja7dNsGwL13f4xi+9y831NT/+8Y9RmaZrHw788XjMo0cP+cYvfYOPPvyQ//Hv/V3WqxWe4yKEJn0LQ2CaRsv2SHFcq3tZxHGm3a4thLWu687v4DgO77//Pj/60Y/YblKevP2Us1Nd/f7yt3+d+/cvGI0HLOZ3VHlGnOwpywLHcfCDgOVywcnJMTc3N1iWyXgyatFrJpZlst/t8IJaG/yAJm/wfB8/yEmSuLPg07aFh/jA6XTahTn/G8mchRCX6Bj6U7Qd5beUUv+NEGIM/O/AQ3QgzH/ShswihPivgP8caID/Qin123/e32FbNrblEicp6/UOgUmR12zWK4QSLdLLwXUc8ra87jBVvV632guCQG8bgoijswviJEaZJmAgDRPbtTAcj6xs2OwSPC8gSxLS/V5HvHt9Qs9nNrsj8DxiqbBNC9U01HWJYQiCwOf+/QuSLEOJHZ5SWLZH1B8wmUwIA4+rq6tOiLXf71mv1220W94y+EfUsqEsS1zHxWit01WkBTOmYejhkO3QG/QpqhLbczk9Pyca9EEUxLs1RVZQpjmWsFheJ8Rlwjd+5RTbC3EMB1EUmMrAtD1MO8A1bcy+ybvvfZsPP77l9uUdyjCwgxBRSbJcD3KVrKgayWy5I8m1yCVOMoqypJE6RGWfJFi2dmzm6a493BvqpmlFNhIhoKkUluPgWwbHJydYls1ysejsvG6LWldKdXF3lmvTKEV/OEQYJsowwbJZLFecnJ5y//4D9mlKnuYI02a73XJ7N8N2DmrTEb/6a+/xT/7J/8nV6xeMBiOiaYhhanPQw0eXfOtb32K+uOPv//3/iRcvXtDIGsu0ePLWY/b7HYvlgjD025lWjUILvrrN1WTacQ9Ua9IyTZOmbsibgjRbo5SGEn/wwU949kzni8xmbzg6GvPg8oLpdMygDZ+djEacn53j+h7xbofj2jieS7zboBQcHU2o64r5fEYQRCAlKMV2t8MLQxzL4vz8HKWUzirp91DQia4OB9chku5AlPoLHQropKf/Uin1x0KIHvAjIcT/DfxnwO8opf5rIcTfAf4O8LeFEO8C/ynwDXR03D8TQjz985KilFLYjk+WxIR
Br8sldCyHppaMj04o0pRGSQaDAXYYUrUrlXnLVzyoAEejEbIBYTnsU00JGk+n2uasFOvNhrN790nThHi34+7utgXCagPPaDTk3uWvMru7a7cTNtfXGwLfZ71ec3HvnO1uR5LmGOYN682OuqkJA5/1akkSay7e8fFx9z0dSLxfDnvtD7WwJIr07l3nWmjdwQG04jkuTV0xngyRoqCsK/ZJQqMy4n0MdYOJ9gvsU5Mff3jD8LTi6TvHCFkS7+8wkTiugRsG2JZL0whGpw94+933KIoPePnZp/R9F1klWjItG6qqoW6gklXrmrSom4a8LGgaqRWQKBACaQiS4kvqOKEXc0rnrWjPQiMRpt3Fu2ctJbuRkqquNR/BddvfpzSL0bToDYYYlkk0BGe7x4siBqMxludp+Kyn6dp5VhJGPcqqYTIa8fRrb/F3/4ffwjANHlzehwZmbxZYlsU7736Nt5884Td/8zd5/vxzGlmTJHu+9a33qOuG29trXaU0JVVlatl2VTEejekPok5xqpRAYGAIE8M2KZOEStbt4SiJwh6O7XapTsdHU6qqoCz12/z5889xXRPX0W//3W6LZRoIQzAeDUmSHZY10DL1qmK7XaPQ7VrTqI5mvtvtEOu1DkRqqz1A54W0JsBer9dVBoektAOq7S90KLSZkTft/++FEB+h4+X/I3ScHMA/AH4P+Nvtr/9vSqkCeC6E+Az4deAPftHfUVU1ZVXjhBGGYTIaDvXOXEpWyyUCzUDwIo+syLHzHLtN8DlAXo+Pj7vT0PN9StlQt5WEadnk+5jNdstwNGK1WiMElGVFluxxbJujY61isy2Toiq5d3GPLz7/nCSJqauKk6MjoijENG3Oz87w/BDLslG84vZuRrLfUzU1s9kdTdNwdXXVUXhHo1E3AT4o0w6xZGEYMej32bfti23bHQr85OSE0/EEyxIYTo1hmlRNQ5ZnpEWJa+rJc5VXOOGAXXaLsH0uH52S7zPWzRbfV3qfXxZIYSLMkEoZvPXONwm9CZ89+5yk0HqDRkoQ+o1f1RJhGdRNQ1FV1G2FIKU+DJSS1FKTmjENmqZuuQ0CYQgdG6cUUklM16GSjTbltL+eZRmmMDRl2tHbJ1oITJyk3H/0EMt1kVJxcnqMYbsMFZyd39MELsPAOoB3ghrX9Tk6PuHhw/v843/8f2CbFn7gUWU5VdnwS7/0TX7927/Gs2fP+F/+wf/K7e21Zh1aBvfvf00PBZMEA9HyE0ftz8igKDRBebPeMZ/PqJsa2/JxHBfDcPB9T681s5yiOPBEHTabmCBwsUzB+fkZcbynqQsMU7ShOls81+L09BShRMsXdRCGABTr9YqTE31fb7YrhBCcn50RBLp93mw2ncnvUCnnea7T005OUC2Kb7fbdRJty7JAiM578xc6FL58CSEeAu8B/w9wcgiZVUrdCCGO2y+7B/zhl37b6/bX/tU/q4uiv3d+RrL//5h7kx/L0vw87znzfO58b8w51NRVXVU9sEFSIgHbEkUCMg14Y+0MQfDSG6+9F+CV/gMvTcttSxApWTTFoSmaY7O72V1d1V2VlZmRkTHeuPOZ5+PFd+KyDbNpwA0TdYFEJaIiIzMy7/nOd37f+z5PhO25RJHAjBVZiiJJIr7cfVNpltK0NfFPUGOSJOHx48dojoPtupRVRVk1tK1Ekucs1yskWUY3TUaKiu37lB3tyDF05lXOeDSiaWp810FVZJIwoExibNPAsUyC7Q5JlpgeHLBZb7AMA4mGyWRElKREcUyRp6RZLtpsneH54ecPKreH9Fnbtqi6AIj2ej3aDuTywNoru8+RJIkiL7i5umMwdXH7Om3aIikKpmlT5SVF2SLLGoapYZgt6+0tr6+ukeuYtgloJRlJU5EMaCioGwPHcXjjzSdUcYNuWuRZApWIkTdVQ0Mjttqde7Kqa5pWBHOatkGSFGRVQe7yFQ0i99A2AhQiIVF0aPe6gbKqaWUZy3b2x2Hj8RilQ8DTMSI
eFkOv32M8nqIbBnfze8IgJhcBCz7//Pl+9yK1LWkY8+jsKW+/8w62afG//7vfZjadkaaCfJxlMb/8j34ZWTL45je/yQ9+8AMcx6KuG+qqxrQN0iRHN0Twpy4rRqMhcZxwcXHBcDhiMp5xezcnjnNMU+joFVnddyjKou54nuJUTFWAVsYyTSxLQ1XEn9txbPo9F6mVsWyTphbFtjCMODk6Jk0EtHU6m+C5LqYpuhBhGFDXYnHXDQPTNPeReN/39x2f+XyOoiicnZ2haRr+oI/SPTo8cEWaRvxdP9Sxf6ZFQZIkF/hXwH/Xtm3w0NX+mz71b/jY/4MS+ZMq+g+//F5r2yavX3xOr+eTxqEgF5sGWZiSpoJvH2UhVWXu0W0P4IjVakWWZQymU+S2Rdc0oiTl0aMz8izj/n7OydkZWVWxuLpku93y6PFjiixhPBpC2yC1LeODGW1Z4jgivZinKXf3c3zP4/riFf3hkIuXL9G6OcCg3yfLS6Io5m4+J45CWlnZ5+VVVd137R9U5g+LhOu6jEYj8jxjtVyRdXdOs3t0eGjAUYsJ+N31DbNDH8MykCUF1TApGom2AV23QJN4990ZPb9hvX6J0qZMfQVFk0HSaFFoMIjjgCDKkGuVMs8xdJ3tMkKTxUIkqypK3VIVDUVZCIN3t12uG/HoIHcE66ZtRdxaV/enJlUtBLN0n/dQ3RU7pj5Flu+r7GUufs88SZEVBad7tIp2IfPbO8aTCY/PHhOEEWVeUtUN48mEqisEnRwfcv7iJZIs4boOv/Vv/h393oDTkxPOz1/w1Q8+5PjogD/41rf47vd/wO3tLaapsd1usG0Lx3XwPKcTwe7IsgpFarm+uhMls96IPK+IogxNNRkOTOqqJk4S+n1/D2TNc3EzeO+9o71sJcuEdq5uCnq+S9OIhaNtKiRJZTo9wHUdqrJBcXXKsgEU2lYmDGIsy8HRREM1z1csl0sePToTi1FV7U3qD92Hpmn2TeNev4/f86nyHFXX/29pYSGwqQWW4GdZFCRJ0roF4X9q2/Zfdx+ePyjpJUk6BO67j18Bpz/xy0+Am7/t67dti2mL7kESx9zNb7BMEyWWcG2bzFS5ubpCMXSSJNlLOS3L2vPw0zTFTVNs32e9WtE0LVmWoqsas/EQVQLqkr7vkicx8XZDnEQYpjgSHI6GOL6H1LZcfP555/vzmU0ngiEpyZR5xvtffo/b+T1lXaOmOWEUo6kiLTedTMjrZn8RPISyfrLg9YBHy7KMi4sLojASPoa6oejOnB8m2lEU0ZNVdGTu1huqrMCzhsiqqMO2CiiqimnYuLrGV798hOGpHB475HGBazSYpoQk1xRZyiKKuFmk3N9mvHp+ydWLS8LtGsvQaZqcusuEJFlFlteUtVgQaEFWVDRdE7kBoKHF0FSRB2laNFlC0iQ0RdtXqh+KbpbhoMpdGKujGwdBgO96+25L9hOtPdVoyeIMBlBmBWWa0/d6VHXNar4EJLZhgNQ2PH78mK989St86/f/iF/4+V9gs1pTFjn/xT/+xzx78Sn/82/8BpvdGsvRaClZrVf0+wO0TvyyWu4wzRxV1VBVhbpMGY2miFxGI/ibjcqjsxOGoyGe6+J6Hr2eS6/fo9/rd0PklN1uh6Kof234TlMkSVCbwmBHWeZE4Y4kDsW/eSNTFA1tI2PoNp6j7X/Pxf2aPBNNSN8bsl4FBEFCWTbourZ/BH0Iuj0gB84ePcKwbcIwJAlDWK0EaqAjLj28Nx+I0v+fFgVJbAn+R+DHbdv+i5/4X78F/FPgf+j++5s/8fHfkCTpXyAGjW8B3/5bfw9ZYrtdoUiQ5QmqKmOaGjc3VyQdvLUscwzHwjDM/YXzMLh6GDQGux3BbkcrtYKGXGbUbY3Vc1GkhrYuSaIUS1NQ2pqB7+H6nqgoh5HQrhUFmqIymEypspSjo2O26zWlIqM7Di1gmgJgqpsWd/N7yiKHtsW2HVzd2A928jzfs/ge/JayLLbM69U
KRVX3Q8WyKCiyHK0LNQ36fQGaKSqaskSVZFb3Sx4/PcK2LAo1R5N18UMxMU2YHg5RLAfF1kllDVdr0eSWqiiIc0gS2Cy3PPv4U77/3U9Y3CyxDR3HkikQO4Gyrqm6xwC6bsR+hyBJSN2fX3ooqekaWg21XFN2XE2RmgOalrZuqIqCcLOlrWrU7o2pa9rejjQbTyiKglUH1tWKgjzNqPKCPMmIgpAgjHBdj57ria5JWuC7Pm+9/SZ/9d0fMJvOiJOEd959G0PV+eb/8k1enb/EtnRUQ+bu/hpFbXFci7atcVyHPCvp9z36vSGGYXJ1dcnJ0QmT6RTP8/ng/Q/RDTG999whq9WGH//4R9zdrrh8/RpZkYhC4YhUVJWXL14Sx9H+vZnnOU1b47libjSdjdFVlYPDCW88ecTBbEoYbCmrijBM8VwZx/bRVAPHsQXct6ixLRXP6wmug6Ts+aW6ru/DeQ8MxtVqxaFt4w9HglIVBHvA8cOOwjCMnxnx/kvAfw38UJKk73cf+++7xeCbkiT9N8Br4L8CaNv2E0mSvgn8CHFy8d/+bScPIHgKlm2ShCEHp0esbm8Jwi2OayO1LVGwxbZN0iRFVTXk7sjuwfak6/o+8lyWJYali8XAtpDblrYq2W02uI7DJokZ9H3hG+jOffWOoRdGIXVZUhQ5s+mE5TxneT9nuVxiGgZNpVFlObRgmRZydyTn93q0koSmGwwmAqkdBMF+wiu2p0HH/BOEI2QJ3TBwusegB22700lyLdsWZqmqpswKFCRefPY5b7x9hiwj4CRSTVW2SMioUklbxWiygUyFKrfdoEmmzGsWiy2fXqz55JPPefXjz4m3AVWWULY1im0hNcIjUSsSSDV190/2k4+JkiyhSgqKqiHLYsBYliVUHbmpaffDQiH4qZDF5JI8y3BcV5w0aRpVt2Ma9Pv7bsvD44Zruwx7fbI0Q5FVRv0hbdOyXCxBkkjSnLffeou/9/d+kT/7iz9HlYXI5YP3v8x3v/0d/u1v/VvyLCFPEyRqVEthdDTg6RuPePnigrqCpq7xvB6GbuH7fd555z1+9Vd/hXffeUqSZjz7/Bnz+Zy/+t4PefbsOZpmoHTP/7tgh6YXe5PXA/354URA2M0UVFUjCDKC3R2vzi9wXRtFltB1hcPZlLOzY3zP4Rs///PYlsugP8C2TMqiYL1ZUdcVpmlzfy92vpapi+PQyZCmaRiNRvtH0vV6vT8iNQyDNEk4Oznm/v5eoAYk4QEBmE6nP5s2rm3bP+ZvnhMA/MOf8mv+OfDP/9++9v7z65oy3GIpCm2aYOsanj9it92iyDLHh0fc3d2xWK/o+Q79vodjO9ze3jIaDUWwQ3EY9H3m8zneYIamyOSpiLMquoFqVURFTm82wbJtdtsthmZ2QNUWxzWJooS2aUiyCk23kWUd1TKJ02taWcKQZUzHIS1rLNUgCBM0zWIympKmJaZhUyUVDQ11XlPnFUEYkKSxmH8oMkUpZgpVUQINWZpgWw6qouI5PnGcc3R0hmXaYhretCiKjmH1ibYZn3z0jK9/430GfZemyfAcFeQI0zTQLdGYU+UGXWvJ05Yam7zUSYMdP/7Ll9xd3dImOVUSoqs1mq5SNCWoNranMrBMrm9uidMtLWKQWOUZmipTpCWmIcQuqqLSVA1tW6FZGlVVU9UFlmXQVOJoU5UkFEVClxrUtqIMQvzpBEWCNM8pJcjaFldR0U0H15XwLAdvMMDp9URvIY7ZLeasthsur69QdZ1Hjx/zi7/ySzx//opsW/LmW6ccHk/4oz/8Pb77ve8gGSVRsKORWlRNRdVssthm1TY4xoz+tMfTJ485mE754P33kXWZqij5zp/9BX/5p3/G1fWlqIWbGmG4o6py0jSjrApURWE8tkjyAg2wXZPtdkdLhu2YSCgE4QYtE6G1sNgiSS2yCVGxxTEN2qKlrCySaIkuF/zZH/0u/eGIs04ye/bojAN7InZSsoyqK/T7YgGQpRa
pkfG9PqqkItUStmFzeniKJEtcX1+zvFsyPTwkLWE6OyFOYqqywnYH6JpGmjfU7c/w+PB38ZJlibqsoGkwPR+pabm/m3esRnEX0TTBBxwORxiuS1kKeOXd3Zztdkscx+i6cE4urm8EJh6JPMvYbXfiotF0yrrGUVVGkwnBNsQ0LVQ1Jkkz0ZnQdUzT4vzFy87RoDEcjTEtwV+Mk5gwiugPx2wvb8g7FLdQc9mEYUIQBnsij2NbFEWOpokjP0mmG/KIO+qDXFfW5C7tKPH88xdMxhORcDMMJFE5xOsNuHh1haS0tHyJybRPVVUYpkpRVLStjCSpFEVN08og69xcL0Dq8fFHn3J9cU0Wi+2m77kkaUIUh+yiHbbrYrsOp6ePyPKSm5s7dNMCSe6OsjTapuk0eZJYGGRF5O+pO5WZ6BMgiUDag2S2bVqKLKfOa1pahkC/1yNdds0/2+ZgPCMJEwauhzscMJyMaVqBSM+6GUSaJJxNJ/yzf/ZP+ZM/+VP+4o++zT/5L/8Jigq/+a//DdtgzWI5RzOEEkBWVNq6RW9kDN3iq1/5Om8+fcKX3nkbXVN59vmn/Omf/J98/3vfI4oCgiAkKVvKMkeWJSSppaXCcSxkRceSNRRFJgwDvN6gmyUUFF1+A0lGkZWO16GRZaLFW9UFtmViGkJ4rMrCxVEUuaBhWxZFkbHbbWiaisVizpMnT5hMJmJXPOhj6Dqvz88ZDQekXYpX13WCMOTw8ADbdmjbh56QgNtut7s9tdmyLPxeD8M0+fijj/bD8L/p9YVYFJpaoL5NwyCPY0zTwO31cG0bv9djt92QlxWeaQijUbf9LLtAkq7rhGGEoohhlSQJmWsYhsRRLEJCjiOOK/McUzf2xzhxGrPb7faPIpIEk8l0r2urqwpJ9ZnMxkRBSBQlYoZRlhwdHnB1fYPrOuyub1gtFrSSgu06zOd3XeTVoK4F/aaqqv0w70E607aCIOVYYvgznk7F86JhMJtMub08x3PF86Xt+aC2fP75BdfX13z44bu8885TDmc98iKnSBUUSSXPU5brFc+fX/CX3/6Yq8sVaarQ1DquIzBmsqqhmzaurBLGEXGS0hsMqKoaVdFxHNFI/cn5jdQ5EQF0TcOxHVpaVrsVum6gaXq3cKioiiIeJ7qCjqIoNFXdqdjFiUQ/L1BUlWgXYisGmqpi2Tae79Ef9AnDgDgKWd7PqbKM04NDfvU/+Qf8+bf+iD/54z/G0m1+7/f/D/IyxzSE6XkyGbNYrrFMh+nBMe+992W+8XPf4PjwmLIsmd/e8r/9q/+V9WrJ5etXYo5RlaiKiGEXZYaqKiiKjK5r9Ps96qaiKHJWqwWGaeC4fdpGYTQc0ev3CHYhYRiz2WwpipJeb0QSx9R1w3AwIkliFAk0RaRye56D5/ndDUmnqmpMQxe9HU3b6+MvLi548uQJaZoymUyYTCYUuQgeVXWFIZukWcr9YoGqbrAsm6qqWXdfp+5mCg8nFLvOd+K6blfX/ptfX4hFQdFU3njvXcIgYH57S1lX2L6P2+sT7LYgK/SGAgmmqgIIqhoGSadce7jbmp3jsSgKrl6/3h/rbTYb2rZldnhIGkUsOopNXYoJeZZE2I6D59oC6CK1rJfCsiMBlu9Rpilt3eB5Pnle4roOcppxfHTIdrMVDsm6RNOFfWg2m7JcLonisAujIO74nVWobcWgqG3AMA2GoyFxmJCmCbPOvxDHMYZrU8ktdVnij3rIqoyqaoTBlu9/91NefHrBdDbtjqlULNsgLzIur6+4urqlbTXStMW2hriei6zI6JqL2dW2szynaGqyPOcBSfdwcpJ3sw5FkrtymCRCTt2ztCIraLqG43mYhkFVVkJuI8k0nYKtqWvUFmRZnLHnRUEcxfQGI05PTsizgiIvaWmZTmccHx1RUXN5ecF8Puf6+pr1YsGT01O+/rWv88lHH/HZZ58htS1JVlCVWfd9qch1Cy28/c6X+M/+03/EYDSm3xvwe7/
7LX73P/wOd3d3qJLMeDwUOx1NxXFtmroCRCDLMLTOPSJ2bpvNFiRIkxRJ0tEUi7psidOE7TZGvZ6jKhqW7TCbHdPv9dF0ndl0xnA4QjUkoKEqSzFUzDPkVtCr5bbFMISt++jwCK8njq8ffJJJkohHCFUlryo0WabX6+O7nug/WBaWae8TirutWABsy+neDyKs9KARGA6H++tjvV7/1OvxC7EolEVJFibIDfhujyLPiXcRaZjgeS6D8UjYj25uoBEAirKqKIua0XDSnRVnGIZJ09S4rsf9/Vx4AGWZ2XSKJMvQNPT7/X2RSlNU8jQTYZrRUOw2dA2/16OuRP16sbhnvVnjei5FIaqqmqZxfHrK5cVr3nhyRs8XFurnL14yX65YLudkWY6sKNRNQ5alJGlCXYt4sBhIyR1iTgymqqpiNpth6CZvv/02sqyQZxl9f0hZ5Ww3G8I0Q5UUdMPDsWWyKKEudV58eito0G1N09aUVU5RFxSFwJc7toHvD9A0C8t1MJy/bvPleY6saeyCHbbt0DQtuqbjez5ZUXRBHRF0abtcgqzJ3Ta1wrJtHK8vPIlFAQgjstTZqOuqoshFmEtWW5F+bFvu53OGoxGu4+PanthRtS3b3Q7dNsiylCgK2axWuLZF3/P40//4H7m6vEKVZWzboZJbLEtFURoc16Dc5fznv/7ruN6AzSbgX/7L32S52OC6Fo4DtqUzHAxQFIXNZoOqqViOxWq5wnZs0rRAUiQ0Q7gvFUUlDEJx7KzbPH70mNFohOd5YkGxPfoDn16vj6ELT2SSheiG2Jo3tUSw3bHbbUizBEM1OJrMcGwTz7XxXBtNU6jKgoOjKVkhqMzvvvceq04wPB6PxQA9z8k7Z+VgOMTt96iKUtStEWW0B9DsZrNhMBqi6/peEDQej/f17wcw7U97fSEWBSQJSVWIwwDdEKTinu+LiWqWMZBlqjxHkWRsz2a1Wu17DpbrInXCzjzPyfOaKAxxbHsv3Oz3+9Rd+zHY7dhutyiKSl2W9P0ehi566qauodKyuL4mTRIW+RwA23YoczFv2O526LpJGocoMjiOzdHxMUVZkGYxaZGyWM2RJJm2FbHVh+2zosiUVdm168Q23HUc0XPoFrTlcsHV1RWKonJ8eAQamIZGT5IJNls0ScGzHIoioShlsdPJFWzHpUgiFEWibjWkOkNVa1TNYjAY07QKru9z+ugRRSOMxYZp4UsSsqJSFCVRGFNXLbquYVsutp10SjKDthFcSEUVd1LXcZCRsC2LOC9BVjA0A7/Xx7Zs8X2ZJrvtjnXn9MhLkY4cDAZi/rILyeIM07QxDw8JQlGscuSa168vqKuKo6MD0jDm+bPPKLIc33Voygq5bbogUItlGwLuIkl8/MMfcnV9z/XNkrKUUWTxWKOp4oSgKIo93zOqa7ZhhKRqGJbD9OAQz3PJ84LpZMrXv/51hsPOPylruJ6HaRrsNiEvz8/ZbXZstwHBNmY+v+f+fk4YCfR60whWhYRA4CuKRBpH1HWJoavYtoFtGVimwWg05IOvvoesSMznc8aTCePxWESYez16gwGL+Ryn3xczs/sFrudhm4LFqRsG569e0e/3sS2Lw8MjAPzhgLTjRcaJGKKbpslkMvnZcgp/Fy9N09gFEXle0iIzHIlz6yhOiZMMJPENREHEfL7ocgAWpmlTZkUHNU5PqQAAIABJREFUJmmwLJvBaMLd9WvSJKHMc+7jO2bjCbJhUpcVeZrx6uU5p6enqIpCURXYjkueJrie29VM4X4xpyxLnj59iuP3yLICu0sapmlGnIkhkaKqJEnEsO/x1htPiPOEINqxWCwpipL1dk3TCMy5LKvdkaqCqooKuK6J1XxRLdBVg/FozGazYdyd3a+3G4bjIb3+kKeP3+TFZ58TZznTw1OC7Za2apGSChSDuk0BlVaW0E0DVdeQFRVUg9OTM2zLIatKxrMpqGt0TSOOE1TdwHE9VsuVcEJILmdnj1BNgcYvikJ
IYzvBrKaJI0AJMC0LzxOadICTo2NMXUfXDXzP4/XFBY7j8+mnn6KZBkEYkiYJj04fUXcDzV5/IDBiskKaZ3iZSc93WdzfM7+9oypLVFlBViRxfGyJRSqMUrQy77bPKXXTkqYlluXz+PFjykJCUUx836LnSxSFwOI7nouqGbxjW+iGSMe++daXxN+RYVDVDYauomoaLS1ZmpFnJff3C16/vqGq8i7wNmIyG7LbhNiuyVvvPKUq62421aLrBjoSRZFR1xVZGlNWBcNBj9GoT5pEbLZryrLg9vYWx+0kSB2g9fr6mvv7e0zTZPoQ2PM8aKGqal5fXXN6eorfH3B8ckqe52RFyWYXMBoN2SwWYk6X5/jDIc+fPeP48JAsy1itVj/1evxCLAp1VYkEo6xQlAW649C0YNv2fstzcHgoNGLLJVVV0+/39mYoWVaIopDlcsXTN94QUdQu5fjh177GYrkUWQbLwrDE3aIsSw5nM6GXM0zyNGN1vxDZgLbB1HSqXHT8bcNERuXVixccHh0SRyH+cEDWpfpaWhQFNE1mMhlxfXPN/X3dmZNcirKmAfKsRPSJxPOrYPmlZGmGKqs4PZuiKNls1ui6kK5qtsH93RJLt6GVMEwLXbfIypK0LJlOZowPDkjSiLQWseT+eEoYRhhdQ7RpG9KyxBsYlEXGfCXMQpZhUtZiMZUVhTTLWC5XeK7L7GC6j2kXWS4ErqqK0fkJFVXBMkz6/T7e9JB+v8/hwaHIHagak9GYVxev2Gx3JFmG63rcLC6gFViwtq6FiVvXcW0H27JwOqxengSkcUSZCVFu0jViWwlquUVRZSzTpqc/ZCVqBoMh/f4Qw3TQDZfLyzmF2jAaTfi5r32FN58edVLgGt3QsT0TWpXFaokkK90zfEAc7ajLmjjOOT+/QJJbNpsNWZax2+24vHyNLMtMBgdsNmvyPCNNU6I4wDSFWdq2TXa7nRDRyhIyUNclvZ6H69k4js3h4RRJavF6Lo5jcXh4gNMh5B3H6dKRyv4Uq6prXl9ccHR8IohfVU3TtCyXK5TNltFoRBCGZGlKUQhXpiIJJ0hVVVy9eoXUCgHOQ1X9p72+EIuCqmkUVY3h+dS7HbKk4PT7mGGIoRu4nott20jI0NIx8wTRVkxZC0zTFsmtWJRKXNvCsWyuzl/tZTOe56GqKqfHJ4S7gHSY4NgW9/M7UbJRZIZDgfUeDgaCKxhHhNsti8WWN56+yefPn9FKsJrPsWyLOBZMxs16SV2VxElEWRVouka8DfB7JlXdYFs2VRXTVHUXAxYps7YRdxRNFTFg05AYj48EjHazZWJNOZjM0CSNu5s547EYKm43G0zPIchCPNdjHW1QTBXPtTg6PSVJMvKyZBvHgsbTNER5jud7uJ5LEidIisJwOKapGrTrW0bDMW3doioaYRiDxj6B+TDkdRwbQzdEldjzmM5mnLz9HqPRSOj6hiJo9MkPP2YxnyMhE0cJeVEiKQqWLVySu90OXdNYxjGpnZDlGVphUbY1miQhtw2WZaJVFcvlgqZt2Gy2OJ7Ll955h9nhAU0t3AdJmpDnGfP5PYvFisnsiPfe/Qr/8Fd+Dc/r4dqGMFNTk5cFd3drPv7kM25ubynrEmSJ8/NbXp2/YrtcA+IRKopCRqMBSRoCDU0r+IYffPBlJAl6PZ/lsuDrP/dVNE3l449/yO3dDZvtgiwTAB/fNGmrmjDKKIqUIMjJ85imzUQJLxTcg08+/YTRaMT7779Pv3tMcF2XXq9HUYjHvbqquLm+Js0ybNtmMBzsjdT3ywWj0Yhev08QCFlxkYravqIonJyc7LVxD/r6n3o9/h1d93/rqwVmh4eUTUPdWZt3y50411cVWkUhjGOSXYDnulxeXqLrwrZ7dXXFdDrFsiyxYkcirtw0dccrcDsnn9B2I0lYrkvQOSF0QyPLUjabDf1+nzzPGI1HDKdT5nd3bJdLer0euu6QxKJ3cXV1RV2VbDYptmMTroQEJU5Fecv
3XbGd7baBQhiaCuqPZdPULWkWCsxcUXWVWQNFUviFX/gFsjRnuVjgOi5FVqD4CjfXN2S5qMXKikzVVDiOw2g8ZLvbgQrj4Zg0LdiEAYZh8d6773I3v+fs0SPuF/cEQcjN3ZxpI7yMTdMQ7nbcXt9gmCaHh4fUZcVyuUTRFHRT7U52aizTEvHrwQDHdjg6OMR1HPqDAbbtYlkOWZJwfn7Bi89foGsavf6Q5XK9P65TbY2mbdjttti6SZWVwo4tCaagrCpEcYqEwOev1mviOBKAE03lzS+9xcnZKZZtoxo6l+c3pFFOlqckSUx/0OfDD7/Cz//i3+fo6JgoWbPZrgi3MS8//4zLyyuKqmK12RFGCbppoRkGSAqGYXJ29pQ3H73ZlYtUkjRGVUHTFZarOVmWEMchz559ilzpXVZAI4p3ZFlKliV88OGX9xbrOI5IlmvyXNC/FRnqqkSyNGRZwvVdVE1BVmQGgxlxFO+J3rqu47ouQRBQFAVPnjzBcRyqqmE2PQTAdXyquqKuwbJMZEklCmOGgzHXV68BcQM9Pj7m9vaWw8ND0Q7u+g8/7fWFWBQkCVRTgzTFMjSW93dIsoSuKWiKRBJsuZ/PKcuGO2mB2+/jOgJtNplN8YcDTF1jMh0RBAGbzYLRaMh2s2F8cMCLFy/wgDjLsDp5rWZZbKKAod5DNnU022Kx3TAZTzFth6yo6I+mVGXNNgjRTBPdMUmSmMPTE9IsxXcdNE1jvdni9/rESQoN9Nw+Wz1AU3WCIEZRdMqiFNCUphV3CLtPXuTYhhgWaYpBU7c8e/acoiioqpqenzGdHbLe7pBllaOjEz7+5BNc38O2Lfp9kzQp6flDev6QxWKB53tIkoyu6ZiaGJxu5nPOP/+M0XCILcusb2+RJNjc3YmL3NRpuuacqkkimWhrDKc+ZRVTlQm6ouB7LuPBiPFgzGQ0ZdAfoOkiwbjb7pAQJy1NW5NlJY6tM59fc78Q7ALXdEUlWNXELqluMB0Tb+Di+g5JWaCbCkVScHk3J4wiyqpEN218v8dkOqHMa378o+8TRRGGLmYwk+mIOLWRZBXVMLm+m/Ps+asOt9dDUWWMvsUHR1/DsR0RvtJNhsMemqxxe3tPsAu6KLFgevg9D1luiaIdLRW2Y2JaGnEcEscR4S7h6uqSuhYN2O1my2Kx5JMfP2M4GHJ2dsZgeMD4A4cf/NX3CKNA5BVkGcv1kDUdRTVoaQlDISRSdZmqyQGT5fKessgEcbppuH59KfSCvgDNVHUl3lOmSdtIRGFMlhSMZlO2mx3D4ZSr69eMJ4c0rcJoPKVpRFFts1nT1F9wwWxd1ZRRSBSGnRAzR9M0JFlC6wxETV0yX4pTA82yMF2XVpLwfRdD1/F8lyLP8T2X2cGMIAwZTafIqsr04ID1es1gMEDTdZBlWknCsG0M3+d+taKVwO/30UyTXRihawWqpmM5HobdsgsDPM/jfrXAsiyOjo9JkoTDw0N0w+T6+powjBmPKm6ub6mrhqqsOkZCJJgOhoVpOfimSbAJ0FWRcDs5OcGyLK6vr7m9u6FpGg5mB8RpxIvzl5imJe7iNPSHQzRVRZFVfK9HGIbkFOyCENvxODo65OXz5yyjBUkYMJ1MmI0HvD5XSIMddVNT11WHejeZjUcM+719gef8pUDPFUWGLHvYpo5tmUitjOe6tC0EYUxV3JOlpRDd+hbD4QDf94Q30ffYbtZ8/PFH/OhHH+M4Jk+fPGE8HeHYNlmccPX6UlTIez6KphBEO1BkNEMlDSvuFss9pVjVDOI4Yf3ps45CJIhYjm2TZDFu4/LB+x/w6MlTbNsjSTMM02Q0GtEfjqkbic1uwW67pa4qbMvBtT00RSWNY/p9A1UxqYpGDBelkiRdkWcVRZFx/uoFQbBhMPRpabm5uaYqS3r9PrOp0MZZlk3TthiazXQ64/LyNZeX12TJjn/wa79GEsds1ktev37FajVH7waauiZyBbqhk2YJcRJzenJKkiZ
IQBSGDAcDVEUh7eZkdLIZVVVJ04TNZkuWFSiyiqKojEYj6qri3Q+/yuZ+zovnzzk5ORKnfF3W5GeuTv///2oJdjtWq5WoQLvuvtFV1DVSnmOaJoPhkLIU+nnhEWxIU9FQTO4EsNNzPXq9Hovlct9UtB2HvCurPMhXer0e8/ktV+fn2JZN0gie3m63o64q5G5LORpN9py7vGPcbTYbcYY8mbDZbPZDpQfOg2VZ9Ho9gjBCQjQNNd3YtznDMCTPM9q2wbYdXr9+BUgEwQ7DMPF9n7opaZoWRdOomxLD1ImisIt+G+i6RhgFRHHEaDRC0xXuF3fsdmvhfpRbEaNVZXa7LYOBcEtKkpCDbLfbvWvjwcD1ICMVAS6ZOAzZrDbUVcV4NCNKYuIkxzZden5DVpdUVc2p+5TNZs315WuKPKeuKjRV5vzlOVEY8uX33mM6mQB/XZpSVZWqKCiLgrIoRXlMkSkbcaT8cGTWti1hKHwVwJ4j4Ps+vX4PTdfo+33auuX1+QVJmtPrj3j7nS+JgBmSsFSlJRoyUiuxuV9wvn1OGISkScp2s+Hly3NxoUgi1KTpGmWRE0Y78jxD1zUkue1+rlNkGbp+y4/aHwt8RNN2WEGdg9khT5485p13vsRw+C7TgwlxGJImMacXR7Rtzac/+oSiSAVwR1Wpypbp5BDf90nTnNn0gLZpKIscVVNQVFAVmdV6Q5SEWKYgP1V1jWmYtFXOo7cfk8QJt9evadqWUSOi8tPphOVywdHBTNStq/KL731omobb29uuP2Bxe3vLbDYTtOA0FRPwosC0bYyqZjQacz+/FW4ETcX1fZIoZLVakoQRs4MDcWF1F2Cv1+Ps+FjESHXRf3h9eclwNiFeV+KO7Wk4rqA177YBYRiR5SVZnlNWAqZR5TlhGOK4AiUPcHt7C4iBnKaq9Ho9yqIijGNcxyFyUqo6omlqiqJCkkXf3nYt6qomzRNRZ9UN/J6PqqkMhn0RH16u0I0Ky7L55JMfiuOqpuHwULx5gjCg1/NZLOY0LWy3G1RVIU0irq4uqYqc8/PPcRwb0zQwDQNVUUkSEeIaj8c4jkOaijfnZrNBlmWOj49Fo0+rmU5gsVwxHk8Io4gsr9ClBnQZxdLRFZXNakXbtizmc8IgwPMcNFUliyM8x8Y2DXqeS1rkbNZr5je3lHmBaRhITYvnOOi6TtW2SKpYsFVJBIwelOsPSLs8z4UPYzDg7bfe4hvf+AaPH79BUZakWcE2CLBMh4PDIzRdQ1FlPMsg2GpEgQCMjLw+J4fHnTvEI88zPv3sM9IsozfooUiChZDlGZIMu01AHEd7AOpyuWK9XHN3e8N2t8W2LTRT8BjKsmA+v2K9nvPtb/8Jw9GYL739NuPxqHvk69PruZydHKFoKrdXnX3LcZlORE7CsU3Ozk55ffGKxf0di8Utu92GR48ecXw6Q1Y0ZEnm/n5BFMU4R8eYlsZ2c09dNaRJSl4WzJf3HB8dMhj2Wa8XbLdb0cuQZdHV+CmvL8SiQMs+rNE0De+88w5ZnlP9BFmp3++zurnf380d22Y4HDHo97i+uUaR4Pj4mDzNCKMQ3/f3dibXdXEGA5q2JYlj+sMhtmGQJjFVUdHqDePRmCRNmN/NSZIMVdNRNBUkGW805vZSnGI8OBJvb2/RdZ2joyN6vR5pmrLebLi4uWW1XKMqCqenp1Q1ZFlJlCQ0dUOaJpimJUJNikycxqIjYYqdked7eD0hzJVVGdMykBUJ3/b2O53BsN+9OZcoyim+7yNJ8p4DeP7yBY5jI7U1SZpgGBpxXFCXBYqiYRiWsAit1/u8vSRJ+11UmqbUVclut8AwTCzDIooigiAgKysWmzXnl69FkengiA/e+hJFnqNr4u56d71jMh5iWxaqLJElMRItx4dH1EXJQhLwHFVRcT0X0zTJixLNNNgEO5pCdEOGwyFBEBDHMWG3exiNRliWRV3XvHxxzt3NHElS0E2TLMuZ3y944823efr0TVp
a7u7vWc3vKcKI3W5HmmVIioykSLQSHB4fcXRyjGHqxElM8uOY2XTGkydPGHsDVusVSRKx2WyI44S2Bc/1eP/LH/BLf/+XO4ZjShjuuJvfEuy2VHVJ09aEYUCeZ/zld76DoeuMx0NUBb783peoyhzDNDiYjnn3g1/n/m5BUzdomkoQxKzXG9qmwXEsjo9nLJc2SRKQZQ6eL+L/mi5zcnrQPQ6UzO9u8P0epqUzmU24WyzJ8pRqXXB0LLgg2+0Ws6uq/7TXF2JRUFSFwXRKsFphGGKbHXc25jxN997IJ0+eIEkyN9fXTKbTPf324OiIIo1pm4bZ4SG1IrO6FTsJTdO4vr7msa6TxLEgFVkW2zTl5upaMAdq8IdjgiBCVQ36fRtJVbEtG0VR2K1WBEHAo0eP6A9ES+34+HjPGsiyDNM0OTo+JqsbNFVH1XUuXl+SZmnXKwBVU2lagQLfbDciJmyZyLJCludYlomiimi0rCiYtoWm6ZiG1cVwU/G5aY6iqsiyyvzunqZpefL0DT766COKomA2m7FaLun7PoNBH0WWeP36gl25xXE8Dg6EZ7PuOIqr1WpPmn44qdE0lTIvqauG0WTCZDymbiBeLkSnpK6RFIksT7g4P0eSZILthnC3o8wzPEfIbMrcIIkjtstl59fYic6EqnUOBVivVuiWSVlXFHlOW7XUZbV/7nUcR2yxq2rPuozjGMdyWS7WeJ5PQ0uLRNvCH/7ht/id3/kPrLcbirKiLSrkvEJVFWRVRtYUDMvEtE2ub6/I/7zAdmws22Q6GXN7e80PP/4IWVZIkxTTtDg7PcP3Z7x8ec7t7Q1Kq3Zi3ZjZwZQ33njC2ekJ1ttPaduGi9fnhKGFLFs8++wZq9WKINgi0RBFOzzH4StfeZ84jnj57Bnvf+XrZGmOJMHrVy+61q+OaQ1RJDFbydIYTVOJoxBNU+j1exi6weuLC3TdZDIaoHSSWcMy0DQZXVdp6pb1asWw3ycKQzRN/+InGuu6ZrFc4naMPsfz6EsSqmmSpSnT42Pi7ZY0SdANE8dxhD1X0tBVDapqD/6kaXF6PqbrohkGuia8AGEQEHWi16auGU8mHB4ekex2XF9f05/M6A1GTA2roxnX6IaJrCisVgv6fTEFljpIiKIoGB1E07Is8TzZGYKm0ym33dHSZDwhDFOCMKQo6z0UQ5LVDrZq7wnPRVkRJzlpKtp4um7twZ5JknZ4L6EMOzk5YTQciWBNWvDq/IKyrEmSjPVqTRxFbFcbdruAD99/n35/yHKxQFXF38fDXGQ0GuG67h4VN+i6AcvFPaZhUZQFdVlh6CYnR0ecnpyw3KyRZAWrQ+Fdv74QOQVdQ1NksqokWK/B91BkCcexGfT75HVDkRdoXShqMhqDLJFmGZswAFUQmlEF/PVBIKxpGoZh0Ov16PV6e3DpoD8SVfUgIOtAuaqmI6kK2+WCsqmRVAlTs5DkCkVVQJbIyowmT9AsDVOVkBto24qyyNhs1l2zUOPs7BG+LyzOf/bnf4rnuYzHE778/juQi0iyJPfQdY1Pf/wjoijA9cTiYlsWX/vq11A1h9PTM3bbLYvFnJvr152yreUPfv8PGI+HnJ2dMp4ccHJyymq14unTJ5imTrBdY9k687sb2hZ8v4dt2ciyQhxFHTjIRZEVsiTBsRwODw5YrzZIqsbx6Qm6qpHEMcumom5qjo6OiIKQ3S74qdfjF2JRaFvRAWgeGo9ti6ooJNstareiaZqGoouGV9O2ZLmIag56ojGWZgmj0Qjf9SjKkiyK0F2XshY4rM1mg6qqewpNLUmMpjN0SUFTdeaX1/iTCYaaMjo4QAfqpqWVJZBk+oMBlmWhWhb319eir2CaRFHE+OCAtqq4u71FVVVePH9JXVU8OjvD8/rIis6Lly/ZbAMkWe6CPJ3EVtWp6hZJ1nAsF9vxkGWZfr8vjNuyQZqkGKZJWZQoiozjupRlyWw6E5p5WSLLcmhl2kairSUc2xOJNsPh1at
LwjBiNJxRlQXz+bwzLptMp9N9jfzB8P3AlpTbRmQgWonl/YK6bvjwww85OTxC1QVZybZsfu+3f5/PP/sMq0v0GapCkWdkqbBdWaYJdU3bQpFlAh7TAk0r7namwUDts+jI247119mSByz5g1Tn/v4e27bRdR1J0iiqFt1xMD1v75E47fucPnlEEIXc3t5R5SW6pyJLwjAepxF5mVFUBVLSomkyjm2iGzrCkSLwZre3NwwGAwB0QyYI12x3Szabe84OHjMcerw8fymgOVWJZZkUecl2s0NWZC4vrxgMpzx58oQPP/wAw/g5ri5f8fz5M64vX4PEHv3/ne9+m8urC05OTrifX/LoyWN026aVIYkzFFWiquD1xRVVVXa1a42b3S225WDborD3+bPn6LrOxPcp8pQ8TdAta9+RGI/GzKW5CKf9lNcXYlGQAKmukRUF1RDps912u++Et7sdpm3jDIbEQSB493VNksTYpi622oZBWRSsVitM20I3DJHTl2W26/WeQrxarcTFbRg0WUERRqxXK16+PMdfrnj61ttEu4DBdEqVZRi2i2knOJbwMbRpyuzoiNX9PYqioGkabWfy7fV6ZE3L0dERw/GY65sbZEUny0uRQnM88cO28QdTHMdlNpvheR6+7wtzNaKlWFUVw+GAoT+m5/cpS7EbkmRJlGIWC3Y7IbN59Oix2DlMRpimQRQEAhqy3ZImKXEk0FuqpnJx/oIwEgOnB9WZruvc3d2h6zqDwaBT2gmrsa4ZwtVQNezWa3700Q8Z9H2ODg/xPI+zyZTRoMf89hpD0/b5kbapcG0bTVFom4bdZkvRmbVN3YAWFEkSx9FVgmaZ+J7PLgpIkwxDfzhyE4yMuq4ZDod701YURfTHMzzHxzRN4iTm+vaWvMixZIvpbMLho2M+/PpX2K53hKsYSWqFTiAJeH3xiijcURQplqnhOiZ13RAmOaqiUzQ5t7e3rNcrej2P6WxMmgWkeUKcKlxfvkBVNd58espms+PVq1coEpimhdzKtBVMJjN02yYvCv79v/9tyjLn+OiAD95/n69/9SuE4RZNVVguF5imxm63oihiHj96zP3dLYam8eTpU9597wMur67I4gVHh8ckcSjKf0kGrcR6ueHpkzcAMRSdPDmgSjMaQyKKQvqGwezwAFpZNG97fZrqCx5zlmSJ66srBuMxatPs2W+mbbNdr/dKejPPcH2fpq4p8xzHtsiSRKTDXHFBZXFCFIaMDw74v5h7s1jNsvM871l73v88/2c+p6q6q6qrupo9sNlsskVSo8PQsC0HAZILx0CCOBcJggC5iq8E6DKxBMNGjFDxhSTIEjRRpESZkkjJJEWKQ1cP7KHmOvP0z8Oex1ysXUekQjIIGAe9b6rq1Pnr1Dn/3mt96/ve93lFnqMUoSvTyUS6GaOIZgGYODs9RctgPl+gKgrLpUNvbZXl0pHJy0KQFEj2UbFDVSoVsigi8P0L5mK1WqVWr6OoKpV6neXSxbRtOt0es9mSF1+S1UEUp8zmS87OzgkTFV036HW7KAW1yPcjPM+VysOTE8IwhESgCvlwZFlGuVwiDGWpfHJygmHoJMlfUq6ULyy7lmGgGxqmrhOFIYKcZlNOGxSh0ul0aDZlPuG9e/fIskwmaxVefpBmrdiQx6E0jiDPsAyTyXCIkibkYUytWsVSNCqVCqsrq8UOZlGyLRazGVEQYlTKVGsVDENjuZTiHWEYJJHM0nRcl0qtRpQmBGlMmmbEcSKJTUX2hQT3yqZzq9VCURTOz8/Z3d2ju75FvV5ndW2VG8/dYndvj6OTQ7733ntMphOq1SpkAlOVFK+z0Tn9XpsXXn4RZzFjcHYqob5ZymLhSJzaYk65XGJ7e5tSyZLJ0CWDnBjLV4lCl1Q1UfKcg4M9trZ22Nx8jbt373N+NkQIae0OwxjNlvFu1WqV6TRiMhnz7e+M0VWVmzevc/PmTabTCePpgOOTA6pVm9FoQKvZxLZrDAdD0jihUq5
RL9fQVEHJtphOZ6SpQ6fdZTQaE8cJjuOSZxB4UuXZ3GjT7rQYnJ2jaTq1ap0szijbZbq9/o9+Hn+cMeL/r+vWjev5b3/2X+L7Pv1eXyKotb9thAgE0+lUBq2mGVZBRVIFTEYjHNdhfW1VevoTOee2LFkO6pbFbDJheH7OxuUrzAbndLs9ZrMpgeszG04wLUuitAyD9Z1tMqDcbOAtlkRJgrtcEjgLatUqYRTR6/fZ3duViKtaDVXTSLMUIRSEoiGynNlyyXw2J0lzHj3a5eGjx7z3/l3G0wlnZ+fEmQ0IWV0EvsxwzGU2gCh2VNnD0IkjSY56Mv2QeWwU4zk5tej3uwSBR5rKRTVNEvJU6gJs06LT7dKo1fG8OXHkyKZdsfClaUoUJ9QbdYSiEMcJ1UoFo4CoKKoccyZxzGQ0JE9T6hXZLzA0ndySRzLf84gjmSURBQGu41Cv1bhcSHSnjothWlJX4riQ5xd4u2qjjmFZTGczojTBtGRkepqm0vqeycBez/MKjUKDTm+NN956h1K5TK1WvQDBqJqCaVmMxyPeu3OHMIiYT4t7QlepVcv0um1UVbC5voZlGrjOEj8IOR1Mmc9n1Os1up0WjUYdVREEgUcU+mQFzUhRJfEgAAAgAElEQVREEeSQ5jmu65OmOVev3kQoKrPZgnJFhrTkIiGJI1qtOqenR1iGjIpr1RtkWUqv0+Hq1avYFZuFu5RA1Rxcx2VtdQ1dl2rTPIfVlRVKZZvxaCBH5FGMqqioqmRmKorCbDqTFvEk5Hx+TppJ343r+HQ7PdbWN1GEim2VsFbWb+d5/uG/+zx+ICoFeZ6t0OyuYGkauRCUKzXGBUXZrFTIxlO8xUImOEWB5B/YNqau4RYg1AhBFMeQZbjLJabRlqusZSM6XQyg1+lKelCcUK83Kds1DNNkOB7RXV1huVhI3bWAIAwkSTHPEKj4foRm6DiOR73ewrQswjhGzVM0y2Q2nZH7EY6zJMsgSRIODo84OjhkNhrhzmcMjo9J4xg3iDAti8lY4r/SNEHTFPzEQ1FyOYXJYsp2W+6+Jf0ieER6/qVHw7QM5vM5ihpSLgsUxUDkCvPZHC/wscwSzWYVhZzpdIypCbIwJPA8XM/HLtmUalUsy5Z0nnZbuvCiBKWIkmt1WiBanJwcY9UqRL6PG/nYhoGlm7iRzBwwTB1NU6gVob/zxYL5ckma5dilMqh6AVPJLqYJqqGjaBqu55KlKaamYZfL2AX5OQgCVE2DTJBGIX4UEi9mzNwlo+mcV15+mclkwtHREUIInn32WVRVJQgCSprJCzc/xHy+4OzsnPF4zHB4TpIk0ohm2RwcnhRYv5R6o05vrUuzW8d3PIbjMcvlkuVsjipkAI5RBB2nqcT4hVGMouqUqw0G4wGbW5d49dZzhJGsevZ37zGfTzg738NQYRb5rPTaUDU5OzpGJAGz0YDrz95idXOTtZUNBoMBSZxJRF7dQC8mctPFnKXv0t/cYnJ2RugFWJaOocusT5KEJImYTn10S5eah3KZ4XAsHZgCgtAn8AKi6CcLmP2Pfgkh6K2toRRxYF7BX0SIC8GKYZrEaYxdLstzUbOJacmE3pWVFZaFRFrTNBrr64zPznAdh3KxwxqGwXg4pN3pFIKkhDRJMEwLx3WoVqs4nofIZEZivlyCIhiNRpRLErv+xEyycJaynM9zhKLgLlzstIznONRLJYKxj2VaTCYjBoMz/MAlTkKEkHPn4dBFUW0Qkl8YxQG2bRVMCBkEKhQwrRKGCeWyQRB46LpBt1dHCIU48UnSiGjpU6lIW7XrhigiJU1SnGWAqmokSYoQKuvr6/IGn0/QdZNqVUdoGkvHIRMKhpVimOaFQGexXFAqtBphGJErMvhV03XKtk0eJ6z3VySq3XVkAlEmjxiVSoVatcp4OGJwds7R0RHVapVqpVaE2OYoyMoiSVNM2yLJM2nuyXMMclzHwSikvEmaXOgW8jxn6cr3Nc/h61//Ot1Ol1q
txng85q233pKJ3aZJHMcMBgOeeeYZNje3cF2HNIm5e/8e+/u7lMtlsiyj1+0wnU0JfJ/FcoZlWWRJRKNaJksSbFMvqq+UNAkJspgo8EjSDN0wMa0SlWqF7e1L1BttqTFRNU5Pz6iUbMndUFTskoEgYTwZoxUcUdMwqdXqnJ6e0ux0ZO5HtUq5VKJer3P37l0ZVpznNOp1xuMR0/GYZquFJmRD1PU8At+XStwCg+e5HkZFBt5oispzzz3Hwd4BaRwXMvkP+kgySVnOZjS7XWajEeVSCfKcOAxlJHm9jm4YOFMFVdexymW5S/u+jDkTgmzpgKJi2Danh7JDq2k6dlnINOHBANO02N8/oNVu47guYRBT7/TIVJUkzwiDmMloxPUbz0g0vC7fIGn2kefsJ7LpJE0Jg4BWu4VhGpiWha4qmArU69JUo6g5V57aYdUPePjoMQeH+yhKRqVqEeU2k9mUbrdD5idsb29IV53nYtkSnyWVhgkQs7m5wvn5EM9bUqs3qNVqkoiUC2rVGmenI8rlNo1Gg9lkim3V6HY6dDsd1tZWKZdLTEdjxkOLxWyM47qomkGpLDMmXM8ljKOLc36z0SDyQqI45uT0FMMyqZTLlEtlDFUhi2IarRbe0iE3DUajEVEQFt6V+CIdKs1Szs8lGk/LFCzLliO0LJM7tABN1VFNnTQRiCwjy8HzPZaFW1B/QkHW5fRoUGglup0GaZwTBj7TidS41KpVdnd3i0i+kPl8zp3332elv8L29jbtTputjXXWVnqMxiNmqpBsS8vEMAxOTyaEzpIsS6mXbIyyjaXJnlTou+iajqprREmGUDTKlRr1RpN2p4dpSkbo+kafk9MzOp0Op0e7nJ2foyoZhibhNfPpqKCBpaSZTODSNY2jw0Oq1SpXr17FNA3G4wmXL+0QhhH9fo/pZEKlUsHSNI6PjnAXS7rdLq7noYAMR6pUqZRKLJw5iUgI/ZAojHhw5x5RFFGv1nFdlyj6CQxRQohN4DeAFSADPpvn+b8UQvwS8N8Cw+JT/3me539avOZ/Af4bIAX+xzzP/+zHfQ1FVQh8n7ODAyrNJoZlsSgAKmEYsr+3V3D/ZENMFQrL2YJypSwZC8WqKxC4jgcIhFAl6nqxpNFo0Gy2QZPkI8uyuXTpCq7jMh4O2Ll8mWVhUQWIwgi7XIYsgyzHcxwatTpZmkh3ZKtFs14nzVKWjoNpW0zHo0IcI7BtkygK2dzaIMty7ty9R71R4+bN67z73nsk5zGLhUe9XqFWK/P887cYjUbFGIaL7MkwDBBC0G532N3do1Ku0++vEQYJiqYxHk1pNdtYZoObN7e4+vR12u0WmqrQadeoVmQZP52OyMnZWF/H9xzcxVJOOUyDg4N97t67S6PZxCrb8j2ncK4aOr4boRsmlmWTAWkUE6YJFcsiThLscpk09C90G6Ui0UvXZIDudDxhPB5z7949ci+m0Whg2JJCnEQxqQIxObkqsMolHNdFURVazSae78usA00KrDIhewy9Xg/XdTk9OWFn+7LUXJSlPP58cEa73cbQVSaTJZ1Oi+PjEx4/fsRwOKTZqtNut1ldXWFne5sjTeHo6BBVVajXKqy0b/Ho8WMm4zFnx8cShV4E+Ahy4kg2bstV2X/pdrpsbe2ws3OZWqOJpuqcnQ2KRyVnfWMdXVdYW+syHp0ymYwIPJexEPS6vQuORsWy8H2PIPDRdWmSa/W6TAYDLMskB5lXslgQJTFHR0e0G00WiwU7OzuyGiuVaDaaJFFEf2OLk+N9fM+h1WyhazpxnHByckKj0WA6/cnArQnwP+d5/oYQogrcFkL8RfF3v5rn+f/2/Z8shLgB/BfATWRs3JeFEFd/XEpUnueUbJvhcIjr+yTdrhxRKspFV3w6nWKVylQ7Hdz5XLrMhEJURLH5fojne7iuy5UrV2RkVwGVEIrEpumqSrnd5vDwUI4lNZ1aEbX1pMPd6felJsAPCJKYyWCIyEHJIQpDbNPEnc+
p12uU7SpxmqAbBicnx2xtbWJpMBoO8DwPu1Ri/+CQNE3lQhHHdDodFssll9vr3Hz2WaIo4vbt2wUO3qBeqyMUQRhGVKt1dM0mDBTq1VUMw8JZZLRbK9y4cYN2q82lS9eIo4QwXhAnKUdH54xHA8pli1rV5tLONtvbl6jVq5SrJc4H59x75332dh+zXC5pNptcefoqy+WSR48fFPFjdc7Pz2jWu3S7PYSmEEQBvu8h8lz+PFQd1/NkQngUX5B8arUapmnK8v8JE2A+x/M8Dvb2OdFPqNSqqIaOGwXkukq5UafRaKEZBtPFnNl0RrctzTyGYUgEveuCqjCbzSgVlvU8gUf379NqtTk62Jf6jSjm/OSEcrmCrmnkScLW+hpnZ+ekScR4MGRwesrRwR7dboePfORl+p02t2+/zsnREYkvj4YVWwrnnKWc+rSaTTTNlhMhJafV6VGpVqlWa1QbLQzbZmNjk8APMQyTlZWVooyfYxoae3sPcJ0ZgbdA12QjOU1TslzazfOF4MrVq5Ilquu4jsNysZBHD9PENgwqpRKKpjF3lrzw4Q/jLxZ4rsfhwQHt4ji9XCwgzwnjiPlsQafTIUkS6nUJorVMi+l0yvQnwbEVcfNPIueXQog7/JBo+e+7/iHwO3meh8CuEOIh8BHgb37UCxRFuTivA/jFzSqKzni906FSq3F2csrDu/fodruUbJ3A9YiiiEqlQprlpGlOGMXs7u1TqVSkek7XCKOYcOnS7/fJNZ1Gs0UUhngFwSaJM7zAp96os1wsCYOAII4xNI3pZIJpGChZhmEaRHFMd6WPs1xQ0xTiKKTeqLO20mc2nmAZCqVSiVKpxOPHewghsG2b1dY677z7PrZt81Ov/RRWrcdbb7/Nm2++habJsJV6vYGmKWiKjq4bVEpVTKNGuVyn0Why88ZzdNpd6rUmh4cn3L59m4f3zzk/H3Lt5mXWN1e5cfM6hnEdRWg8fvQAw7Dw/IB3332Xw8MDHMdBVVS6vQ6HJ6fcf/SQ2XSK4y6lLFbXCX0fXZd+hLKi4Hk+mSJlxEkqUeNRHKPmkMYu6ArVapXFTMaSJUmCs1ziuR5ZntNqtej3+1iZRpwkGKaBZplgaFiVMnatgmlZZELIsjqU+gnTNC96QqVSCT8KqdVqnJ6fSXza0kMT+sXP++zkBAoPx2Ih/THtdgvHcaVqNAxRVIVKpcFsPmG5nPP2W2/S63f5+Mdf5WBvn8PHD+l1pKBraTjMZnJBOx+OyKCwiNexq3Wevn6d9fV1NtY3pJYkirHtMpVKhXv37vONb3wD31/iew6GIYnjWZZTsss0m9ICr2s6uQrdbgeFHMO2SOOIJNY5OT5mdX2dyIw52NslSVPWtzYRWUZaTD8Amo0GQlFwlktMyybJMjRVpddfJYsTFrMlvW6fsl0mLBZ3Xf//qKcghNgBXgC+jcyY/B+EEP8V8DqympgiF4xvfd/Ljvghi4gQ4p8B/wxgbaWPXaQzCyEujDpPtO+KqlIql2XmIlK9p2v+xYIQxzFJlrK+vU11PiNJU1rtDvPxCNuWnMd6s0mpaCwpimyYCVVFUVU6rRanx0cEQUC1WuX05ESadHTJdZyNxyS2RJDFUci7b7+FVSrRC3xavS7ucoFlmjjLBcPBFLVw9TUaLU7PzjFNm8ODI1568WWWzpJ79+7x19/5MienZ1iWjL237SpRmKIqOc1Gk83NTZ69+SytVh9TrxLHktD053/+l4xHU6rVOs88c5PtrTW6vT5eNGbpuRwc7bOYzWi1JPTlP3zta8xnMy7v7JBm0On2OR8M+KMvfJFWq05QNEwNwyCJI3JVUpYM0yJJU2yrhF9oQmy7hK4qzMZjRuMJpCmmrlOqSju3aZoXOgff9wuLeEiv2+X69es0yzIARSgKiq4R5xmdfg/V0BhOJ5wNzrEMg/HZ8OK1SZJgWKZMCSt0GKVSScrhy2VECmenJ2xvb2PoGoeHhwS+R7lcxnUdBudnVKtV6rU
q02mC4yyxTJ1GrY4qBIvFjMlkxHQypt1q8vwLL/L666/jeR5BkexcbTSI4gihKGxdusRTT19l49J1er0+tUoF27ZIklSmWXk+b7/1Fm+88SZhEGCYCqoqOYvVSpXRUHIshBBS7WnbmIaURRuGTrValU7cqkxBu3/3DleuXCmeVu0iAGk0GrHa61+E9ZRsG01VGQ/lxM5PEjRdGp/yLGM8GrG6tkY8i0mThH6v+5MvCkKICjKO/n/K83whhPg3wC8j16tfBv4F8F/zw3Mn/29iiDzPPwt8FuDm9av5+ckJ1WYTtTAZTSaTCzdcnueyS9zvkxRhpSCNSL7vE8XxD0SaVStVAsehWquDEOhQHDGCCyRamqYkaYKiKuTktDsdkijmYG9fGp6ee47j3X2SOKJcKjMenyMEmJbFzs424/GYO3fe51J4GUWVO2W9Xmc2npFmCacnA1RNo9tfI0tTGr1V/sNffJmj4xMODg7IMkGpVCWJE1rNLp1Ol0uXLnNp5xLPv/AipVKJ9969y5996cuoqlGIj9q88spLlMslylWL+/f2eOe9N9DvG6Cn9Fbb9Ho9cmIePr7P01ee5u996BdoNOvMx3OGoyHf+Oa3uP/wAe1uD8uUyskwkD+XPFNRFZUsS3E9B8tuEsYRhmmiaBqT4ZA0TSibpoSKxAmq8rdJ05omTUJxIUXXNI1Gvc7a2hq1Wo2tncus9Fdkoy6O8cMQq2Rj2RY5FDe0Qb/bxTItGs0m0+n0IrQ1yTOyPKPT6XB8fIxZNmjWZI5HqWSzXC7I86wwqJlF5ZkzHA4olcq0Wg0ZzlsuEccR5UqZ6XRCmiZ885vfoNfrsdpZYfvSFQ4OD4lmU5IcyAWVRotGo8HOlatcfvoq7d4auqazdOVot2zbVGs1FvNF4Z5d5ejoCNdZsL6+ymIxpt1p02k30FVoNhvkaXYxGSuXbAxNxXOWJFHI0d4uG2urTKdTRoNzLl++TBBF3L9zh97WNu1Wq7DWmximyXg8lhkdxbEkz3KmI5ltYag6znzJnfG7JKmMsavXqz/ZoiCE0IsF4bfyPP/D4qE+/76//zXgT4o/HgGb3/fyDeDkx/37T6y7nuNgFrtsvdmUfYECbGKYJnGWMx6N6K6sMCnCMoQQlBQFx3HY3d1ldW0Nw7ZZTqcoeY6CpNf4BZfhiblGN02UXEeoiixpy2WyZHGx8k7OzrEtk/FySRyFGJpUBx4fH2OXbPTCdfn2W2/hBwEUcXODszHP3rqFrks8epwL5rM5Dx+9SRxnJElOkmT4foyhW2xvrXLzxi1u3foQy6XDZDLlz7/0l/h+wNraGq989GW2tzaxLIvd/Yfce/A2d+7codVq0O/36XQ77Oxsc3x+QpZHHB7v8cy1azz19DaNahPTkF31YTxksZjjuj7Pv/ACL7/8IkeHR7x5+3XW1lZQFdh9/JCNtT55npLnAlWvECcpVslmNBkznc1YWenL8jWTVGby7OKYZ5fKeMUIWVNV+v0+5Dn9fp/V1VWq9TqVeo1KvUYYBCRpKpu7QmAoKhXTkkc6u8TR8QlWYTSLClWpomskcYJuSjm2ikKWpqiKynQyRREKrWYLz5WOWd/1MIoE5iSOaTUb2JbFyclxoWCdkaYJSRxjGgaT8YTxYM7lKzFPPX2dk7NTNF2n0+uyvrUpv4/1dTqdDppewjB00iRjMZ8TBT6e40rfTbFpXb92jdlsjB+4UtE4mbKzvU6jVqFkWtQqFcigWqnR7XWYL2YXzt6VdpujoyNKtRrj83OJb/d9tra20KtVkihCNQzMUok0Ti6ehflkSrleJ4lTTEOmQt258z65yDk9OaZUsqg3qqT5T8BTENIf/G+BO3me/8r3fXy16DcA/CLwbvH7LwD/TgjxK8hG49PAd/4fvgbtYgSn6TqqEGiGFIk8KUNVz8PzfVrtDt5ySbffZzaZ0G63oThzOs6Scq2KyHOaK33yJMUydMIoLHwKkqjjLJcy8rxaYTGfSZN
UmuD5Lpvbm0zGY+aLGUkcgwKmbZJGguXSYWV1jdl8xnyx4O79B4RRiG6aHJ8cs1w4xLHgnTv36PVXGI0mOI6HphscHBxJ+lKWUa50eO6lG3S7PeI4YW9vnz/63OdRFI2tzS2uXbtGv7+CaejcfvO7fPXrX6Fer6MWFcn2pQ26vQ4bG+sMhwNOzw7pdDs8fe0as9kc2yyhqhr37j/ke2+/y6OHj8mSnFvP3uQX/7N/gGZofPc7r5OlCa+99nFyEt747ne5du0amgIPHz7AtktYJRN35NDudsiylK2tLYSAyXCIKJrDCuD6DnJDVUkyyIQKik613qRk21RqDdrdPuV6DdUyEJqsFAA81yWKIhbTOXmSIeKUarlCr9cHAZ7n4YchQhE0223q9QZREmPbJbyFi09IlqUyFanZlEQucgQCx3XQixDXKApIEhnKQp5KJmWSk8YS929oGoqhEqcqfhiydB0++rGPYdkWzVabza1N6s0mpiW1E0JRSBIQIqFWr5KUTPI8IzqV7ILNzU1ef/11zs6O6LSbXL58melkSLlcxnEcWo0mcSpVp37og6GR5jm91RU5iRuc43oeXuCDonByeiqT0fKcepwQBD5pksqMzc1NFEXBWSxQNZXpdIymGYRBgmbqVOs1HHdJb3WFVqvJ0pkz/Qlj4z4O/BPgHSHEW8XH/jnwXwohnkceDfaA/w4gz/P3hBC/C7yPnFz89z9u8gAgFAWrJLMHNF1HURQyRSGJIhqNBsvF4oLGpAiolku48xmtdgvPWWKWy4gso16tEjkOuaqQFicWVdhUa1WiIEDJwVBVSsVEI8tTTE1hXnAXBRmtVh3TUBlq4gIBFvohmRD4YcT88BChqghNxQlDTs/OUXSNs8GAOE4Zjj3eur9Lnsv5okAjy8A0ylzaucKrr36MlZVV7t55jzffeJfZbEatVuO5D73Ac7duMZ/PefT4IX/zzW/KwA49J8lSKvU6lmVjlSs8//yHuHLlKZIkuUDBlctlFHQiP+Vwb5fbt9/gvXfv0G53abd7fPrTn2Zra4Pbt7/NyeFjVlZWaDTq3L97h9lsxs2bN3nj9usMhwOEUJgvlgih0Wg1mc/GlGtVNjbWJMk6y6iWyiyWUsacCoM0yzg8G0lGg6LQanbxY7h24yq2bXNwOqSTxmRF6vbSWRJ4PrPJFFM3mI/GHD/ek2IlyyQIQrJcVlWKohLHkvxsGAZxlGCZNqKiEHoheQ7lag0/jNBMk5pl4HkudbuOoghKtkkcBiSJR71Wx1kYMh/BlyE3Wp6jC0WKt+pNmSVpyJzJq1evysq1ITMi86LvFYkEw9RIYqkxSHWBEE0s8wZ379zl/OycTr+Pks8JA4fjw8eUSja2YWAbBjmChbOkVLLJNcFkMsKqlPDCgG6vS0KO0CUIp1wu43ou9UqJIAyIz05IU5l4pVdKjMcD2aBNIxQNIi/ErpSIs5QgjVi9vM3j+/dor/ZQVJWZ76EZ1o9+Hj8I3ocXn7uV/+UXfg+zWiUprKRhHMvRVmGZ9cKQyzs71BoNnGLs8iQLMUkSmu02omiY+XEEQmZUzmczmaxjWWxu76AqKs50WgAwoVqRkVpe8XXr9TpxHDObzTg8PKRUKlG2y4gc3nv/fcIo4vDoGN2yODo9YbFcMp3Pmc7nBEFEkulomsHp6TnVag1Q2Nq4xE/91KdI4pQ33niLMIwQSs6tW89y7dp15vM5o9GQ+/cfFK68Booi8DyfVq/FredusbOzTbfboVyu0Gk3cAs7rRA6R0cHPH68y3g05fj4mDCMKJcrXL36FC+//DK2VZV9hgf73LhxBcvQ+dznPsed99/j5s0bdFpN7t+7x2g0ZLlYkKUprWaT1bVVytWKbABWyhf6ifl0iiZU3KVDrVIhTGJKts3p2Sm6qlEpy1i5ZqPB5Z1LlCtl8iyjXi1jGHqxAAvOTs84Oz1FEYKz01OGgyFxklDpdGiv9HnwUJKtK7UqlmUxGI9kA7JIw9Z
VnciXE6YnbIs0i/E8F6WAm7ZaDRRgfH5GkiY89dRTPHz4ENeVxCvdNNBUjWazSa/fZ23nChsbGxdxg09+fcKxzPMcRVWJRYJeJEZHUYyzcIjCiN1Hu/zVX/0V9+/elyxNsaRSkirParXKpZ1tOgVRqt1pUS0a6CB7ZP31daq21Iucn5+zWCxks9UwcD2PerWGoetMp1Ns275wjfb7fUrVKqauFxV2QJKCalloyIzO3kpfKoZ9n9lsxnMf/7kPrvchzSRLUdXkOTwIAjl5yHOEqpJkGc0iD9J1HA4PD9nc2rqIIHsyzoyiiFarRa1WI0wSstSh0+kwGg6pVaqEnkcaJ1KOa5rkCheGoLBwPuaKUuRGZBfIssfTx9ILoGlYtpwVn52f43o+qm7y1NVr7O7t8/jxHktPpjG32m263T47O5col2q88dYbDM6HbG5ucfPWs9y8dYO79+7ypS//Gffv3yeOZFS9aZkMp2PWVtf4yCsv87M/9/P0+y08L+Tdd9+j2+3RbDY4OxvwrW99i+FwXFjCJ9h2iW63y8c+9irPPHOddrvBw4cP0Y2MSqXFR1/tcPv2d/nqX32VBw/u8/GPfYxms0MYBjiuR6UsG59ra2ukSUIQhKTktNptUFWCSO7efhCSxQm1coXzwQDPd6iUy9RrNZzFUqZmpTHOfMbRwR6dTgdDNxDF+X0wHKKoilzsPU9CU6Zj3DggjmJGB/sEacrKygqDweAiJewJS1LTNFzXxQs9LEN2/jVNUqSSNC7AuTWEECwWS0LPJfE8EALP8VCEQq+/QppnUqlZrdJsNqnW63S6XdY31ml3OtTrDWxb7qhPUpUUFBRVQS84iZom0FQNkQtiM76A9zabTTRNo1WtEfoudqlMr9dH03SGwyFbly5Rr1WICnDs+sYGiqIQ+z5qpUKUJCRpSrVWk81IXb6uUi4TBCkbGxscHh5Sq9UupOmNIlU6S9OCl1EiWMyIk4SV1RVUIVBVBcsyWFn5gLskn7vxTP653/g1jMLmfHx8TKVSoVk0G7v9Psv5/AK6Ojg7QwBbW1tMp9MLsCtC0Or1SJKYk+NjatUq1UYDspzED9CLVTL0fckTSBPiJKLaaJBEEWEQMBwOL27UeSG6sS2L2WzO48e7JFmGaZd4vL+PH4SEScp0NpPlaxCyt3vM1tY2L730YUajCfv7+/hexMrKKj/3c7/Ao0ePcZYO//4rf0aaZrRaTelsFFJG/fTTV/nUpz7B1atXUVWN0IsIgoDT0zMODg4Iw5DHjx+zv79fJDaV2dzc5PLlp/joK69w5coVFEUi7u7cfYcsTWh3ejx4cI+vfOUr7O8fInKF1177OK985CMcHR1x9/332H30iE67yUsvvcTbb70pfwa2iapp5AIqjTqu71Gv13l47z5pGNGsNzg6PMRzJqys9On3+ozHY3RVo1ouF5OVprQv5znddgdV1bh77x6aruF6Hm7g02q3SbIU13OJ0xRdM2m1u0xnU2kCK8Jj9w4PKJVKtNoy30NFxdBk30nXJdpcKDlnZ2fMZhNpFVcEkecRLhaYlkW91cQPAzT5+PYAACAASURBVPprq7R6HVbWJKPxiZem312jUuDfTEuSty6uwoWJgFzkZFkuuRCqShwm0jUZpyzmCylhTlPOju8zGQ2khN11pICuZGObOteuPs1wMJB/LpV57/33qVYrCCHodDqoukGt2SBLEpIwIoxCKvUGZ4eHmAXl6+joCMuSNDLP8wCwbVsKlSybs7MzXM9lvYgR6Pb70n+SZ/Se+tAHt1LQdI1Op8Pe3h5WkaTrOM7FN+46DtVGQ1YTQkgYSZYRRnJcZtl2MVO3cBcL4iSmt7KCu1gwPD6h1WrJJOpqFQG4jkMSxVTrVRk2oqpodgnXcQnDiLMzOduO44QwjLBLJaxSidWNDe7cvcfx+ZBMKAwnM/wgwgtCzoYzSuUyL3/0VTrtDt979z0mkwnb25e4ceMmg8GI3/yt32T/4ACBIM4
TDMNkPBmxurrKq6++ymuvvUan02G5XDIcDahUqkwGc/7oc1/g7t27jMYjLFPi39rtNi+88Dwvffglnrt1izwX6LrO3t5D/tW/+jc0mjWef/45qrUKv/4bv8G7775NFEWoqsHHP/ZJPvOZz3D7jTf43B/8AbZlsLW5yTPXr3J4dMhgOLiIFTNMEy8MiFMZO6/pOlbJJlU1ZvNFYR4yqVYreJ5Lu9Vkf28Pd7mgUqmAaDCfT8nSDFXIXTQIApREZTSdkJFTFy2EoaHnFoaqYGk2o5E8SiyXS4JQOiV1XWc8meC4Lmtra4hMEPrhBbJ/OBwSxQH9fo9y2ZY2ZDLyNMXQLWy7RLvdpdFu0en3WNve5NLTT9Ff6WMUC4CWq4U9XV5RoVUAQJFOybzoVylCjrPzTB5lBaIIQT7HMi2azSaXLj3F5ctXiKKA5WyKqgh8d0m9UiZLU8hSojBA1w2uPv00e3t7UsW5WMoxuuMghKDb6xH4PrPh8ELMNSrSyzRNk8lhilIYxXJsyyLwPTrtJleffopcEURRyGwyIiPHtOwf/Tz+R3rO/19dWZZdkHWCIGB1dZWsMMzouo5VqaAUBGC1wHRNJxOiIifv6PDwgsqTJJLFtxiPyVLpFkuKXAHdNDk7OiL0A0zT5Px8gGFZOEuPPM/xXJ80yajXmkwmExkk2u0yGo8Ikoijk2NyRVCp1RjPFqS5YDRdoBkGrc4KL334w7z/3h0OD4/pdro899zzPHr0iN/5nd9hPl9IPmMsE4tLJZutrS1effWjPPvsrWJ3kwapweCMB/cf8PW//mseP9hHEfKB0FSNRrPOa6+9xic/8QmuP3OdRqPE0dGAr371a/zVX36Vvf1d1tZW+PhrH+XBwwf8yZ98gSyTltpypcw/+Mx/yvMvvMxn/8/P8sYbb6BrKq3WNs12g/3DfQ4P9tF0mbjsOA5oGl4Q0O73cD1PqgPtEk60wPWlq1Q3DYIwJAxDXNdhNp+jCIFlW7IacBwUVaVWbyKikEyROy2qgm1bLD1XgmQLvkIa5UXWpVKoVVOCKCLJJFpvsVwyHo+plWtcvnyZu3fvcnx8jKqpaJrKcunQbEp9hOe5xEHAWrdPo9nALNk8de0qK+vrmGWbaqNOqVJB0TTyPEfN1R8Q2hiGUShv5Ed/oK4WXMBg5Pujkqc5zWaTe3fv8e1vfxtdS7FNA8sy0RRBGgWEvkunWadkSxZGybJxl0uSLKPdbNJbX+f85ASyTGZWZBnNwuA1OD+nVC5fJEfX63XOzs7kwtHtFlWTjEIslWz5bCUxWhGwPJvPyQovx4+6PhCLAkAYx7R6PbzlEtMwcHyf0HFkh1VVmU8mlCsVtGLVbjQa0ikZhiQFlTiKoovzVbPZxvN9RA71eh3PdTk/PiaNEwQCUzfYunKFBw/uc/bgAUmSXOgY7t+XSURPJMqD4ZDpcsFkOiWIEvYOjplM58wdn83ty/RXVlFUja99/W+49ezNCyrS7/zu70qtQuEaVHKFy1cu8dRTT/HTn/pp6vUa33vnHfIsZXWlz59+8U/53//1vyaKI7lD+j62WUPXTZ566gr/6B/9IjduPMPKygqqCm9/73t87atf56tf/arE3ldK/OzP/CxxEvPbv/1bDAbnBGGAYWi8/PIr/NN/+k9QNYN/8Su/yuHhPqZp0mm3eeraZVqtBn/we3+BZZtsbW0SRB5hFBFPp1iVMnGSUK5USeIY3w8IoxhNN3AXC/I8ImOBrhscnZxAltNqNqi3OoRJytz1UBA8Otij3elglGxc36O72qdShP7mwMJxyLMcx5PVgV2kjjeLBf90cE4QBPR6Pa5cuUKtUuNg94BqVTIdoyjCD1wWi4VEqPV6rK+v0ajXqdfqlMsVgjCk3evQW1tFUVUUXUUoCmleBNUoP/4+/YHryXGiCPyJolhSmE2Dra0tKSTKQtI4wnOWCDLajRrVSglTFQhy1Bxmkwm6baMXLMrR2RmbW1vExXHWNE2yKGI
yGlEul6Vj2HGKQKNz2STt9QAYjUYSwJIk+K50vObtFuE0wirZ9FdW8DyPSZFb8sOuD8SikOc5eSpnx3ZBVXLPzy8w1J4nd3JNVaVbsvDZR3GMoeusbWwQRxE4MuzD0I2LOXrJMIjCSMa6qxpW2SQsMiWXrkutWmNj5xLvvfUWDx48RNd1XNfj8FDGmp2fD6i3GoRJzOHxMa4f4AUxGbC+scn6xiZpmjMYTnjhxZc5Pz3i/fffv+hug0SUV6sVnn32WX76pz8loR8nJ6RhxFpvhYd37/H5z3+e6XRKGEaYpkmepKz2+jRbPV588SU+9alPsbW9wXQy41vf+ianp2f8/u//HsPREHKo16t85u9/Gs/1+PKXv1Sg4qDVavHzP/8LfPrT/wlxHPPLv/RLHBztoagKYajx0Y++xPalLf7tr/0fLGYz9KWG5y1QVQMyA0VT6Sgqx8enVKpyCgAKqqazstLiMIqZzBegamQiQdENGvU6jVqdWlPmW7a6fZIkJs5igjTBLtkoSVTAX1Nc15XgkzQlKzrtQRD9rRzY0Dk/P5fIc8uiXpdOx2atiaEanJ+fkWUZ7Xab6Wx8EdxSLpdZX19j59IlytUaWZ7JZrFlgRAX/QKRgyYEqCrihwpyf/R9K2sHcRES3Gq3ZMPYMKk15CKapgmh55AlERXbRBM5JCHuYkqqqiRpQrvTZTqb02rK13uOi6YImb1pWszmM7lJ2DaVSoX5XPpMrl6VZrZmp0Pk+3ieJyEzvk+lUadarUp7tm7Q7fZQhPwONfUDzlMgh3qrJf33hexWBpxIsrNtGORJcrF7J0UEnOe6mKYkPJuWRZqm8sihacWbIUNFDNtiWYwmQz/AMi2SMJKqRM+TVOhm80K3b9s23W6X+/fvc3R0hON7+HlMmuQsHJdMaPT6a9y89Tx37z2gv7LO9s4Vbr/xBrPREM/3ik54il0qcfnSDj/zsz9Nv99nd3eXlZUVhGFy+/XvMBwOef/99wnCAE3VQFexDI2Nyzu88sorXLv5LJcuXebevXv8u9/+TZpNiQj74z/+k2IsZaHrOq994lU8f84fff7zpGlGrVah0+nx8z//93jh+Rf57nfe4Itf/CJ7B/sITSYVr6522d7Z4Fd/9X9lOp0Q+h66plHxKiRRSqO2Sqla4eTsFEVV6a30MQ0TsowbN27wxndfZ+k4LFwfRTNQTQvdLiE0g/76BksvYDpfSn5jDqVaiSjPWE7HbK5vsL+3h6HpLBcLLNOiVq1iajqqarJELvBnZ2eouky/dlypDARZ1pdKJUzdvLAC27ZFs1UnSRJqtQrXrl2j1WpSrtXQCwL0k8AhVS3Qdk+e6yfXE9/4xb2Z86MKbVmB5yiKuBhXZpm0+0/GE+7cvVMsHBmB56KKjI2VHrqSUy9ZzKZTNAU67TZZmqIIQXtlhSyOyZOkgA/bBEGAQFqt4wIHt7a2VuRm5JRKJVTkMdyyJNOBPMNZLNANA7tkU6tVOdrfR9FUDNMkiqMf+Th+IBaFDJkvGPk+i8W8sBqX0FSVOE3wHefifBSGgSQjWzaaphIEvmyuZJJvGMeRfFgKX/9yuUAgEKqCqiicnJ5gmSae48rmS/Emdjpd7ty5c5Fd2F9ZwbJt/DCkpAj2Hz8kywVZLmh3mqytrfP222+zurbBYrHgG9/8FmEUoiF975ZlUrItnr0lNQaLpcyBHA6HvPveu6hxznKx5Oj4CAGoQiZP9Xo9PvTcc3zyU5+6+H//2q99lvfee48bN26gqPD5z38Oz/NRVTBNg8985jP4gcMXvvB54iSiVCpz48Yz/ON//J/z5ptv8cd//AXu3LnLwcGBnHkXMYKf/OQneP311zk9O0HXZCS6aZlMpiNKdhXHXeJHAaqhsbaxUfgKPNylwztBwOnZKUkkg3h1w6TRbDEYDHA9nyCMmM/mBHHCdC7pWI8OH8gbtFrF9T1m8znNWgPfDyDLSaNY9iLKVVbWVhlPJiBgPB6TZhmmbV3
0jUajEaPBmGatQa1exy6VqFWrNJp1ms0mlYqcUrTbTQzDRGhSMSiKCkEg/nYxEEj8QTFV+LurwA9bFgS5LDGKzxBCIBDkmRS7CSHY2twmSWMmkxHkKVXbknbuNMKyZWMziSMavR5xnOI4LsvRiGqtxtLzLs7+SRLTaDYlpzIMUXVdSsltm6PHjzEtaYd+6umnsQsEQRLH5HGEELBYRBiVCjngez7ng8EPMFD/7vWBWBQ0VcGyNOIow7Z0skylVCuRBAFJkkgkW+BgqIpsOFarxGlE4DmUKxWCYEkUR9RaLTIvRjMEnjtnNpujaSqe5xUyaIdUTYmVhEhNaJZrDE5OydOMO997G3cxR9F1ojTh9vfeZjAa4QUBCycg1epEYcilS5cplUocHZ9x5fIlBoMBb775JqVSCWIfzaxgaiq3btzg5ZdfJs8zlssFew8f85U//3Pywkp8fHYmQ0yCoGggNrh+/Tof+chHePbZZ3n06BFvv/02t7/7XZI45tVXXyWOY770pS/i+z6WZdFoNnj11Vc5Ot7na1/9ujQA5SovvfhhPv3pT/O5z/0+ruuSpinHJ7vkhBiGQsm02dnZIYtzbn/3TfJMIY6lFn7peqBoBHGEH8qQXiMxEXmXg8f35Fm+ajKfj1i4UiMhEgUdhWDpspzMaDabBH7AfD5nMpmgaRqP93ZJ/RiByXQxJq5UCOYBQ3dEEASEuiRBx3GMPxiRCXn0EYZOrir4vkeSS89FGEWcnJxgGjZJkv9tgnbhsXj66afZ3NhA0cv0Vsqoqo5Icml+KzT/QhGQZsWIUZCnGUITZLkUEj1J/xLFvOEHrjxHEBXmPQVSQZ4JhKJjmJrscQgV1/GZTwaQhJAEhH7CPPEJPIcsCdF0i9l8KbNLh2Pq5Sp5mnO2d4hmSWBrZ3WF8WRMnITYto1t6PiudIHGhUQ8yzLpbzk/l9BfRYE8LyzY0qxmaCOm05kkRHVqZNqPfvQ/EIuCAJzFDNdxaDabktSsq2gYZFlCuWTjuA4lo4LvenKkpWqy26yq5KpKLBSO9vexLQtn6VCySxe26cFgJDkCRVzbzs4OrVaHsmlDrjIcjUiY4PgB49Mz3CBgOpfQ0STLiJIMXbPptCV5986dO2xubnJ8fMydO3cu9OyGbtDv9/jEJz7J6uoKnudyeHjIm2++yWKxoNGU46NHjx7h+pLjWLJsrly5ws2bN3nhhRdwXZff/93f43vf+57MNqjX+NQnP8nbb7/N6emp5FUaBr1ej1u3brG7u8s777yDoqgkScLly5fp9/v8+q//Oscnx7z84Zf5ylekdyJJEjRdaht+5md+hj/8w9+XHXdFIUc+LHmeXbhSJffRvTCJPUmo2t3dZblcyinQdEYeSefitFCK2rbNm2++eaEziaJIull9lzSL5eSgVscLPaJFWJTzKoqjyMg8Q8P1fVzPKdLFIU5jsigDkZGkGnkO0+mMJJH/7+lszO7eI4RQqNdqVGs12q0Wm5sbPPeh5/jEaz/FyuoKpmEghIJqIEGYxfEhTTNIc1T7Bx+JnBzyv9tnyJGUSUX+G+L/au/NYjRLsvu+X8Tdvv3LzMqsvbp6me7mNGfImREpmqRAS4ZIjwiDtA3Y0BsfBPDFBuwHA6IhwJbfJAP2myGbhg0MDEND0SYpwZKoGVMcCiP2kBzOdPcsPV3d1V1dW1buX+a33iUi/BAR94vvVmarOequLngyConM+pYb98Zy4pz/Oed/BELY4kGHe4fcu/eA+SxHCMmNGzfodVu8d/s2o6MDBoN1JIa93T0uX75ElmW8d+c9OjLh5NiyiSEFG5cvc3iwx9HoiO76OlVpuRiN0iwmc45PxrSyFlmrzXyxoN1NKUpFkiboOOXg6JhuLCztfseS8g61JcLtDYZUT7v3QVWK44MDZrMZvXbblVWTrG9uIcEWKDWG48MRWdZCK4VSFZPxlNnMumZEZEupP/vss8xmC9599/2azWiRF5yMbdLOo+1dxuM
px8cn9Ls90IKj4xH7h0eMJ1MORiMm8znT+ZzZYgFSkqQtLmwMuXz5Mg8fPuT69evcuXOH3d1djDGWM+DiRa5evcpP/aWfsary7dt861vf4t79e9aU6LbpdDq8//5dlFJ0Ox3SNOOFF17gV37lV8jzBV/5yr/gO9/5Dru7u/XJ9zM/8zO899673Lt317qXdMVLz7/Ipz/9Y/zJn/wpu7u7dkGqiqjd5tatt/j+m9+j3Wrz2Z/4LK9+41UMmv2DPaSMaHda/PzP/zyHh4fsOQo4KSVa65Xz0NKPLd97+NAmut66dYtOp8O9e/fcpl+gFor5YsZkMmG+mDGfZxwcHpCkMTISjEZH5PmCLEvQUjCajJku5ghp09wRcOe9O7TaLbqdLlJX5Iu5TcF20ae6qlBViXb1JPOiQIgIKSzHAMbUeTOHhwccHOxz9/07fP/73+MHP/gBb735Fn/tr/1VfvInP8eFCxeQlQUIDZpKKaQUxHECgWZgjHFs+qtCwWDIc4WUhigSCGGFg0DQ6vS4+fzzYCSz6YJ8usdbb73F/Xt3iaUkloI0tutyf3+fWICMIMosk/bR6Ii01aLI52SdjmUByxccHh2RtTLKsmIwXOP45ITdvX22trZ45uZzHB0dkWYZi/mchw8ecOnSZQZZ7OjtIkrnXZvMZqiqZLD5EfApfLzNsJjNXPFRSeyqK4/2bBUmS2AyptOx7D6jkzEXNi+QRCnFvGRnZ4/R6Jis3ULKhEe7e8g4Yj5fcPjgATKKKJTi4OCARVmip1MUhlvv3eFkPLN8AAYOj444Oj4hSlLa/SFxu8dkOqPd6XPp0iV7mknJ66+/Tp5bktI0TVlbW+PmzZv83M/9HJPJjK/90b/k9ddfpywtjbZnBrI8ChqjDYPBgJ/92Z/l85/7HA+3H/K7v/M7HI1GNXnI5uYmn/nMZ3j9jde48957gI2N/6mf+ilu3LjBV7/6VUauilZRFvR7gxqPSdOUwXDA7u4uRZHXIdvdbpfPfvazXLhwgS9/+csW3JLS1lfQllNQCIGUtlCrqhSJ22h7e3usr6/XiWn9fp/d3V2kkMhIcHw8smZAnnMyjjBGs79vswKP5ragT6tKIRJoYTiejgHD/tEBN248Q6fftbyNU8saVJWVLWRblRht8+mqSjEtJxgDcRyhdQXGAsZSWJp8VYF0YKExmsV8xt333+f46ITj0Zh79x7y8ksvsba2TpLE9Ppd+oO+rU9a6ZXAJSsOxGMMIQJBnLSciSFQGrSyp/Du7j5loTg+PuH27XdJzJQkgn6vx/7eHlkiSYcDoiji8PCAQbdDp9Nimi8YxOvkuuLCxhqLqiRJU4gkw0Gf9c0LNqx/umA6nhMnKZevXKXX7zNb5MRpxt0HDxHA+sYF4jRhUeQIGZG1O3Q6XaI4Zv/wkL3dB2Tb25zVngqhUFYV45MJqqyYnEzo9rpovWAyntBqtZnNLMd+v69pZy0mk0ccjUZEcUxeFBwcHjEajai0Znd3n+ki53A0cryLBYtFTpZl7O3tgcDRWxv2j0Zs7x4wm88QImI6m4GMEAoWlUZGMe1u3yZSRREbGxu8+uqrtXrtNZHr16/zhS98ge9973u8+eb32D84II4i0jQmy1okSczu7i5laWsvbl67zi/8u3+FZ555hi9/+R/yzjvvUBSOZSjNWBsOePmlT/H9732XBw/uWXMqirh58wZXr17hG994ldlsSpYlLBY2gGs8tnECShniOKLTafHo0SNLJtK17E4XL25x8eJFfuu3fos8z5FSukrOFcZoa2dj7WkhBDKxJok3H/b390mShMPDw7pCtQB6va5jz5ZUlaSqrBuuKBZLG17AZF5QqJJSlRRVQRxHGAHv37tDu90ha7dsfoOq0JVCYgvSGmWFq5SSLEkwxqBUhTD2lFblAi1EPU5CSKpKUakSEMxnU4pc8Y1vfIPvfOc7bG5ucv36da5du0YURbz44ov89E//NIPhgHavRRTJegzq5tV
t95IUsTUbcBYE0GrZIrhGgxCWT+KZywPefuv7zCZjiiKnKFp2HCNhvQ5VSa/f49q16xwcHSDimNH4hLSVoauSdnudk/GEqrIlD1pZxsmhjRYVQtRJgYeHhyRJUnsyxscjpCqsl85Vzh45bbzTyZDybNfrUyEUVKVsCnIUsSYiZJyQdGJMpdGLHBEldHsD3n33PYSI2H70yKrtZcHxyTHT+Yzj4xMqbUk70k6H4/GYnUc79Af9uty61tqGGJe2tmNRKUptKJWmqgqUAV1WKFM5uqyUXn9AmqTM5zPHuTdHShtpF0WWT//HfuzH+P3f/30ePnzIbH5sN2cFvd6ALEs4Pjm2aHEc8/LLL/PFL36R9+/c5n/5n/+BJRRVtjhJHMUkccTVK1d47933eLS9TVnmIATDwYBnnrnBq6/+a8aTMVJa8s92OyMvcpRSNr7BaDqdtqsHOXYl9+YMhwNefvkl/uiPvlbXZ9RaWc3F1a+we8CglKqxNa8dGYeE+0hTy3bUoSwKyjK3RKrGkCQxRbGoqfXyfF5zcEYiwmJgmiSJbP0HCcYIytIKt1YrRSqDcEVvkySqN6gAIimIkwStYpTSVJUK7hHSJEZrg5EKIwVK2ecrVM7xccV0NmE0OuK99961BXYvXeLNH3yfb3/7W9x89iZf+PxP8tLLL5EkKXHq6iMY7DWVRkSW2MUkgrJ0Fb0QVGXJ0dExQkjiyCZKSWnY2dnh4sWLPCwKOt2uFbzaUah1uwx7XZIk4tHuDieTsdvEMy7duM7e9jblfE6v16tjL5IoJXZMVCKOOXGRvZ6JPI4ikk7Hlj/QNnGs329xPLZCqZVaijalnvJiMAYotGQ6nnAyWdiKx86Wnc/nlGXFcDig019jMp2yvbsH0jIm7e7vWh77srTFYaZz8tGYw6MjZvM5J9M508mkjh0QwqqHWmsUhsr97eggMcaeOGVpg2guX7rEYj7nwYMHzGbWbk7TlHa7zec+9zmeffZZvvKVr3D//n3yfIGUoFRBlrVAaCbTMeOTKd1un1c+/eP84i/+Em+88R3+8F/+PvPZjHaWUJS2BFwrTdnc3GA46PHat7+JEJIkiUHA+sYaRZlzeHRQ11gUlahHMGulaK3IspRer8N4fEJZ5rTbliFofX3Iw4f3OT4eYRNQjbOjY6RM0MZqC1pXjgIPGzeBD9Kxzaf51qeoAJzbzNd71NomA9nMQoExVihQaYxKkfYoxRjlNn5i07IXC9LUpjJXVQUYWmlqNQMnZKSEWEqMExLGKATalVYz1gSIIpLYahECZat6C4M2FVVln1FIQVEu0KbiaLTPzs42t9+9xZvffZ0XXniBy1cuW8ykKFyy1SYXt7YYrq1jtOZoPMajElVVIUTkvCBtlKq4f/8+Wmn2DncZnxwxOTmh1+1w4colVFHQzpI6rmZvf4e8LJi5+Jwkjnnn+2+yvr7Owc4OrSQhi2I72EqxsTagXMwolSKfT7m0dYFWGttgOVWRRG06nRaHB4cYIZBRTK/bZXfvkauQXdn6oGe0p0IoCBHxaO+QxXzB2voak70jKlVxdHjEzu6OBbyiiMFgjW6vZynSqpKDw0PGkwlxkrDIF1THxyitSdKUSmvSVouyKJjnOZXSFJWqQ1OlEGgBlUPb/X1EUURVlrTSlGuXL5NEkkc7jzg4OKCqqlpte+6550iShN/7vd9jZ2eHLLNl2CtdWECvnTkQcs5g0OfFF1/mF3/xF/na177Gt7/9Gmgbku2jNbM0BQzP3nyWb3/rz+t6kIUq6PV7XL16lTfeeKPGDDyBp1eZ48hWcur1ei5Ix7Ihz+czhOhw9epV3n//fadh+M1ucQSldO1zF04N92xd/sT3QsBrDEpZO9/OzdIO99pEfboHKrgqSspoUQsMlLIhAlWFABIpidxmT+Ko7suTnBoXiuxNGhvlKt38CaSMkVLUsQhSSqQAUWkUXvhXaAVCC7SWjCcls3nEeHzE3t5Dtu/e4c57t+n3+zV
HaKfTYX1tjY2NDfqDAUopZouC6XTmeC5skVfAUaoX7O3tU5YFxpR86oUXuH/3LpG0YchGKZ65fgXcoYeB9bV1LsQ2SCtzp3kUJ6A0oz2r9l9wldBHR4cs8gVXLl9hPj1mHMPU1aAsioLpu1Ou37jJ8cmYbrfLfGFZsMFFB+tqJT6r2Z4KoTCeTLhz94EtiAJcvny5RoRH4xmHBwd2ofMAISVjx4gkpGQ6nzGeTuxJ2+lYGm1lKPKcOIpRruJQlNgBNlqjjUEZrBtOOJsXgRSgioJWZv3lWRIRC5hPJpYB2mXqvfLKK3Q6Hf7YsSOlLlxXaUUUCeLYYhbz+RSwbqnPfvYn+NKXvmSBqLIkliVxFJGlKcZYlfza9Zvki4XliECgVcVw2GdtfY3Dw33GY7vRq6pESqvSWnCQ+rcQtvz4YjFHa0W326XX61KUBQ8fPiBJLIJtQ7CtQFCqROnKbSh7LQIh4FuNNTgtzguGSCiiKEY6D53Uy88K6YWDbp5JoQAAIABJREFUROUFlauYnaYpkRfOTp1OkqS+LxnFLnCnQgpnMrhoRDAYo1FVZYvnKG05DpLE3jvayn68NuSK8iqNwZtMWOEgFDKykY6z+YJqPifP5+y4Yr4eO/Fz33a8kd3OgKvXrnH/fcP3v/M6ZVkyGAy4fUtbz9hkQrvT5sqVq7z22mt02y2kgKpY0EoT3nnnHWIpWR8O2NhYo5WmFFXFIs9RLocn1oZWy3rjTg6PUIuCbrfF1auXmE8njA52SSUk0qDyOVmWMdwYcP3aZZKkxfXrN7h9+x3r8p+MmY7HJLEF8nnaMYU8z3ntje/Wtu53v/8Dur0e3U6HoixtKfjDY4zQTKYThJRojKNUMxgMldao2YxFnqNLjRRLNdZFpyNlDNIgjEEr7XzzPs7Vqpra2HDTJJL0ul3u33ufxdzmqQ/6fT79yis8++xN/tk/++eMx2PiyGXVOQkvI2UlvDCkScqVK9e4dOkiv/3bv83x6IRWy9KhC60oCkvwqbV2zEEph4f79WkuhD0lr127xhtvvFEnfQnnF7ensA2tlR4TEPa5nJeMNLO8lFEkSLMErY1D7u3G8hWuEcYJBVO74mCpGXhU3gfKxCvBL3YjKGW9FXESW4FRWxj2fv1YGWOQQthxAqIsBqdhSCdArEouXFyBcNl+tlK2dIQnQmA9Jdqu8cgJysqZEktwUPiHcXEYNnxRSgHC2IIrkcSUClWVzOZTx24djA82liGKJEopep0hd+/eccS4MWtra+w8ekia2gzYTqdDv9thsZiTOXq5fDFjfWhJUcpFxealLap8UbMwT0cjW3Qma9NpZYwODihzO9/SwMnomCSCwoGW+XzGoN+nWMzptluWoEgYRoeHXLn+DO1ux3lqYmbTmcubaLOYTDg+Hp25H58KoVBpxbRaMJ5PyAvr6hvNxy65KaHb7ZEXOfmidGqzPd3Deng2yASnz0q00LYuoQPObKRXBMJGs8koRmqJcUCUX2RGaaIoxmibyPTOO+8wmdgQ5auXtrh5/Qp/+P9+hdnJkVV1BeAWsy4N0qQYLWm1u1z71DXW1zf45p/9KZPRIe0sI58fuQ1lals6iiJ6vR7D4ZDbt2/Xp3Or1SJttzg5mThb2FmxxtQYiEWRJUJE7vQ0aAVJnKJiQxyl3Lj+DLfeehtVWb+6MqU9MY12SLm/Do5b0ssEBU4w2RPWqu9KC6I4chtVUQlbRyNOUlua3hikcIJKG5uIZCRVJKiMQSYRpRGOuSi2qr3TTrQGrUtkZNmGTGT7KbVBeVejECht6VmN0migFSfoKEZLA0LWtrOUgkjGiFIR2eBktLHPro2h0hWlsVhGEseIdsS8qlDFglhGJHGCdvRvwoCsbATkaHZEWqVECwuEHowPKfKCtbUh6xsbrA2HxO2MThkR6ZSD/SPW1wYUswqVV2Csxy1rpYynCxbqiHleMprtsrV1AYWgs7bB8fGRFZDtjFa7RYlGxBHVvEILyFV
JFmVknTbd4YC93V1bZu/wgO7aZW4+exO0ZmfnEZWB4cZF+sMLtAYbZ+7Hp0IolGXJ4eFhbSt6ynCtNbOqQjmEPM9LqnLpIgttVqs12BgAre3RUaPeTtJ7lt8aQ8AuTCOX+eVpktb++clkYqtLOzBsa2uTV//4j9nd2bGL2Qkbf4pGUpImFhjb2tzixU+9yNe//nUmk4llAAbSNKlPweXClTX1m3+WOI5tQdALG/XYeAFiBYOn6LbP71X5oijqknhJollbW7NRiw5HsIxT5Yq9TjAmYVu+T/2MClW/Z5yWZoL/V0rV1aelCx/2z5vESxNASGntfocV4DQ649+TEm2MDVpSql4Pfu59oSDflNauorWu15EdL0ubhgSNRmqJlMbmOkjhBJGuWYtarVZNl17MF3WGrRTC5qc4k0cpG34vpbT8jZFkkS/Y3SuYLxbEcczVKLJxJK6s3u7uHmkiSJKI4bBn6eGmMypVsSj3aHU69Ltdkjhm+6HNR1kbDul1OyzmM/rdLlEsIYJ5URIlCbO8oD9cZ+viRaIsBWHp4opCsXAsY7bGZ5fj42Om02mdWHhWeyqEglK6DgYKF+vSj65doU9b0tuDbLAKhEUejDKqXqiwRMr9tf1rwhiEpl5gPv1Va0273a7zBtrtNs899xyHh4c8ePAAoObn99eO47jufzgc8sorr/D1r3+d0WhUZ7NNp1NSpw6XzizyG73VarGzs1Pb0V4opGnC9qPtuh/PGRFOqo0+FLbKU1XVgsZvot1dW9vSmh6gA85/P3anCQUvCPz7NQjpNlz4mp1HhURgHMFrHMfEQtbmhsbycSoXNOVdieFYWJMhqceyKRDCefSf96/5/0sp6z48rhHH1qMhpEQ49agsS5cEtQRF5/M5eW4TmiKC53WaZOTS91sB1lBVlY2Tqap6HLTW7O3t0Yn7bKxvWMLaVorWEYaYvb095vMp3U7bMjprxfUrl+sYg0hCEkskmkG/iy5z8vmcC5cuurT4mIlbg73hBq3uEOKIjc3LtNot5vMcpYRLChyzubnJ2potnJNlma3NeUZ7KoQCDh/wLUSujTH1oomjpPaH53m+BLo8z4L7fIRABTH8zeuBEyZOKIQnIrCy+Dw9nBCCe/fuAdS2s/9MKGjiOOall17i1q1bK4van0D+efznvbbhgTt7qkf1xrbOEiuslCueUrsk3Y/WmspUSKfxlGVZF2c9OTmpsQg7RoBeCgC/cfxYhnPgN3s4HpFT50NB6sdTa41xwK3W1k1YoetxN3KpdYT9KqXq090+u6rHwr/vPx+uj/AZ/FyHAtMfFFLIOjDL4wzeHVtWFUovhZR/jna7TRonlE4I24m3h9disUBIa/55lmfP9eAFhZTSFjPKrfYzGPSYTOwGlxPNiy++wPr6kKPDA7q9HpubG7SylMGgz+133iaJYy5tblDkCw52diyJTqfNbDpFmYgKiZEJcdpmMsuJkikyjplO50RpiyhtoXNLYuu5Jgy28NLR4aFN4DujfZhiMC3gXwGZ+/z/ZYz5b4UQG8BvAc9i6z78p8bWkkT8BUvR+0XkN4jfXKHLTSmFqkx96iz94KZeZP46hsdPErsezOomrnGoJaruS6D5iL9OxwKD+/v7terl+wyv59XWXq+HUoq33367xgwqxwXhr++R7NgF6Hh/dZZljMdjC0S5/y8WFnz1J6/XBvx91+NnNKq0ZsfcEdN2Oh2Oj49tYd2GZhVudOAxARB+tvkTfk4IsXJCxsJtdm0DoEQAVCqMK8qTrMy5D4bym9gH1oRz1Wz+ml4IhBqSfz+OY7tBkRjl5mnFBe3iNFjVQDyIapwWN5vNwBgih4GkaUJeWNeqdylHUVQLA0+7HscxETnaaI7HR2ysrxPF0G232X60w9HRId1Oh93dfXZ2HrF5YZ1r166SJgkvv/wi+XxBd22Ng71dsjTm6tUrTBcV/eEaRaXodAd0u5YiYPvRrtMODf3BGmVRoMqy1noWLgbCC9e
zxhU+nKaQA/+eMWYibPm4rwsh/jnwHwN/YIz5e0KI3wB+A/jb4ocoRd8MLvfqtZ80v5DKQtUnlI/JDzUMf6qYAE8IF394gvjf0jT+775XlpbXcTgc1os3SZKVU8sLA399sODkzs5OvTH9tf3khGqzV4m9qry2tsbJyUmtWRRFUZsL/rP+RPULsH4+jUsNNvUpCzgXpk2GmU6nFEVe+/LD8fb/D8Oa/Xt1oV8nfP2mCbUbX7i31si0rkPKq6qyJdwjeaoQCjUkpRSxjIgiuSKAa7DYaYReYwxNGb8GmuOq0XVSU70XXN6TEMKmGbt78HNfFIVzmUoiYbk4/JxXlarH1T+313ayLFu5D+lg1E6nw6JYkOQRucOpBv0e0+mc559/jsFgwMXLlxiPLXfIYlEgpbCUeFozn875s29+k+HGRbprRR1Ap5RiMOiSJClHR0dUlS2aI4Dp1KYHXH/mGbIkIWq3UWXJwNETntU+TCl6A0zcfxP3Y7Al5/+qe/1LwNeAv80PUYq+Jqk441RaLkpRT2q4uEIcIooiFNol+DyOK6wALOb01/1G8NRW3m4EVja0F1rBWCGE4OTE1qScz+cW1XZagb/XUDX2bTabsba2xnA45Pj4eGWTxnFcq62eosw/l78/pTRVuap2LxaLOkAqdjRkcRLV4c1+4QK1ZtZUy5uagaVjs3MSmkE13gCOpdh7fZbzpFkKktD0C4Wxn5imwPWfaQpVLwCWIOxyLnwfUkikkcvnC7yVvq9mW5KwGDxxa9i3UlWtiXhz1sc0+OuVZYmkQkooqxhtUgyWvXzuuDs3Nze5d/8hvW7G8ckJSRLzzI0bPNo9wOiKssxRZUkry+gO1mxEpZB1IV+lNI+2t9m8fMUWgZnNGWxscPf2bU5GI6bTKQ/u3ePKlSu0jHXdLxy561ntwxaYjYA/Bz4F/E/GmD8RQlwyrpakMWZbCHHRffxDlaJvttBOdn3Wr/vFG8n4MdXHvxdKeiNtZtsHqZ/weEqs7883Dyb6ghve1vef8apruNGFEMxmMzz/gMcjmgvbf8cvqsViwXg8tpx6SjGfz2vVteV4+Tw5qe/XP3scxyRxspL3H6rm3tb2WpcVrqsofjju4VyEm9U/f3h9f1L6zeJP3aVpZlavcwro7W15cILIz08obBogY3iv/j78mDRNIz++xoTSYPV6TdzJGMuYsByXJUYRRRFCRiv32Ol0LIYQPIfXGiJpvUWVqlBGMegO6LbbqEqR5wXD4QDiBCUiIhETpS2yTpf337/DYjaj020joox+v0uvP7CBbdoeOoUrrRgbzfpgQD6b8Qf/9J8io4jFYsH169epHFW+cGtyuLbGA4ePndY+lFBwqv/nhBBrwO8KIT7zAR8/zdfx2M4UQvw68Ot20FbNgHCiQlzA/r1Ua0N10r9/amfBdR+/W3HqZ7wa6VmOkiSh49iFfQCRDzoJbVu/AYFaffXsOMaY2uzxZogXCsqldlsqsV6d4FIai2/4aDof4hwKIQDdwALCzaq1rslOCGjEmsBd+OzN9/zm9KevV8+b3gh/G/W81fieQUixZEBrgIUrm9MJztCl7FX10OvhNY7TsI7Q/EnihEQmtVmAi02hccCEwkcIgf+3XCqh4ND1QeHvy8+tn9d2u02eL+pnt9e0Hpjd3V26na7DoDSjkwn7o5Gt53B4zGDQA6MRRtPu9onSjOmi5N69B/TbVlvcvLCJHPQZDIdobTg5HtFKYp6/+Qy3332PVqvFcy+/zPz4mOPjY+6+/z55npOlqWXpOqP9hbwPxpiREOJrwBeBHeEqTwshrgC77mMfqhS9MeY3gd8ESOLY+E3SxACatq2X2H6x+Pe8TWfteBugFJ4qpwmE04RHc1EVRcHx8TGDwYDBYIAxxrmElsCdvwchbCprr9djb2+vJgr1p7LXGvwJHm4K688uODw85MqVK/XJg7bh02VZ1uqqByj9RjHGYLRa2bjh+ISAptbKkQWtbnp/P+Hzh8I
2FDJNfKaJK0RRROFCqf01y6rEqOXzerMnHOt6DWiLAzTNmdM0AP98p5k/teBEEIu4DkUHi3VYD9UStPTfWV7bgd4ywpjlOAghiGJZr0kv7LtdW2+z1+vRbi+L0chIgLTga14UyOmUNEmZzm0ldaUMShjSdsZ0Nmc8sZ6EJInZ2tgg63R58GiHTqdLJ5as9zocHh4g0KRJCmg6vR5lWSAF9AZ9NjcvIOKEhasbkec+C7VVh+af1T6M92ELKJ1AaAN/Hfj72JLzvwb8Pff7H7uv/BCl6FlZkKctBKWUDV02CrABJDXo5Ww+gwAR1YFMNsjRrCyWpaAQCAlKrIoGIZ39665ZakU1n6GNTYyRkaDTbVGVNu6+qiw1faUUQktOxiOSJCVrJZRlhXTMPAZFUS5qTccm4rt8BQSRFCRpRJEX7Oxsk7Uyur02YlExnc2pSheY4+z4enFGiY3QjC3ZSVEUlNp9RjizSjnQTZq6f1WfuBIpQQqPvHlPjkBqN1Zag9YYYVxOhg0Vli4RS1WlA2ytei4cEOg9DsaHTQbNYxLekxRqhpUyxGKJGzSFnBdM/lQON2Y4NmDteq30MubFWADUBkopBIYIYxMhgMj4cCwf22RDv5EGLbUNdIvsOJVFiXZ5HK0kI44i1i+ssbG2zt7uHhQV7aTNbD4nToEIJicT8nle33un07FAqISimFvPizHs7j6k026zMewyGR+zvj60vCKF4nA2oJAx797f5vLlSxyc3KPTabG2tmYB7umE6WxC1u2ws/OATq9Ht9+lPxhgtA03H48nnNU+jKZwBfiSwxUk8I+MMf+PEOJV4B8JIf4WcBf4T9ym+wuXogexYnv7U0c0FpLSamnvSQcesRp1511CRjmBoJeqors/26MFnVcCeezrllY+jH0QGPJ8QVHkKypkHMXEMagAv/CuQ7uxjK2wvGL7qhXhV5tAUiCNJM2seVBOrDup219nc9NWJp46os4mUAcgKoGMRO3O8ye9rsfMuyJBysjlVixPOm9+BMqy5W40pv5tB9AKEG1stGKpijqDsTmO9XNKi977Fp70Idhaz782aL2KZ4RArRcU/jnDNRGuA/+6wVApKxB8mbf6cDDBGvFzsvIkLmrTaLS2p73QNsw6iWOyNLV5K5klWFkfDi2TstYkUcx0MWd9bZ0sy+pSiH1XAMfzWRZlwWwyI2slYGCxmFuzoZ3ZHId8jpT2YLx+/TpKGXb3D3ju5k0uX7nMvbt3SVttDo+OGI9PuPncc+zt75EkMQ8ePuDzP/3TFDMbym2pOEVtnp7WnooCs2kSm8ubG/VEhq688HfoevKvNW3eEIALkekm6CiEQCNQpyxmLxBCoWACVT0EDJsAVZqmbG1tobVmNBqtxAX4zVynDjeew6t3Wus6mlIT0e50a6+DEILJZMJsNqvHyS58ewI2x8KbGEtTwNr94T03x0VKSSwlSWN8/TXqtOnGM9Sb0Dzuqg1P+nBuwrYcU0MUL/GK04Ko/DyErkkfLRqaRN6tGX43NOn8/89aI02g1Zs97TRDYDkn+v0+F9Y3MMbyP8xnM1qZTVBKWhl5UdTxDJ1Oh36/byujBwF4SIjTGCkgSWKGgwHtVoskEpwcH9dJVutr6wghrQBaG3Jxa4tL169jqpI7b9+iKnIGa2t0Ox0m8xnT6ZQrN28SG8HhwQH93hBjDAcHh/ylX/oPnt4Cs8awYpd629Q3v+CaGzCcwPDUCTdtuEh9H8vvnS0QV66HQTrtwb8XajNh4EpR2JJvTT98KETCQKvw1PMpxakjFinLEqUFqixZGEOaJCRpSqfVctpLsfTLW6oRwJsCj29IAWhpw8r9WCw1iseBXgJh5QWk32T+mUM1PnymEOfxcxcKDd9PE0fSWiOkAcVj9wRLl3Dz+/6zYcBVuD78MwC1thXey2nfCQVheH+qqpgrTZamdT7HbDazwLQzqWTbjsmRowUM8aSisJXEvYBfX1+n1W0zGAxYX1tjNp9ysL/
H7s4urTS2iWHGoCpF5LS8oihctq9iMZsxWF9jc3MTjOZkPOZoNKJ/YYPJZML06IhWq8PW1avk4xn9fp/W015gVojHN3UTVQ43um/haRN+rgmEhd9txtE3TRR/3ZVrWrRiZcH5TeJPfi8Uut0uaZoynU5rWnn/2TqYJrCTw368+RECerqskDJCq5JZmdMVXcffEFOVBVppBNpliS7VdePUYuGuH0kfGerThuWKRhU2b06EY+SFh4/G9DkiXmNpeiIeQ/Ibgv00TKDWxOwsrAiL8B484h/+3TxQwvgH30JMInRjhp8LBcVZ0X/GGCqtSF3yXb7IUWVFt9Ox4eztNrOpjQ25uLWFgbrMW6fTqTVBH3Xa7/fpDfsYA9vb20wmY+bzGUW+oEwTup02ZVGQpRnz+aIOpz52GkRRFJSLOdM8p9Nu2UNDStpZRrfbdaB1ST6ZOsZqW7DorPZUCAW/SCFwKwYqYvOEDRdQszWl/2pQzOknwmktXDCwBCybp1SoEns7LVy0YbBO+FlYqsveDPD3pbVesitpgxAatN3ki/kcKQRpklAlCUWe22xxIWzAljs9IikxDRdieGLX0Z/mcYTf/4T2frhBfS5FiG2EgVD++z5kvDkmTZMvjNtQymZZah43WZrj39RwQjPJX8u/HwY2hWspDIgSQliPD48fOOFcVsbQbXdQjhNUSlmf5t596rGldrtN5H77MfEAuY+Wnc1mFFXJeDqhKkrAOKbtiF63jxRgjKbd7pJlKUkSs7m5SbFYcPXqVbRSDNfWORkdMZvNyBcLPvv5z1vuESEoyoprN2+y/2iHtbU1W7Y+PXv9PxVCQQhqF0lo2zVtyLCdpvb6Fm4Av1Ca5kgNKJ0S1XiWsAmF02nfMcbU0Y/eZPAnV1PjaW4Qf40wDsKOhUJjEJ6b0FhyFk/KYkzbxUHYyD3loglxTEy+sLrQlqcwjBNoal3N5yUY3/B3qBWcNW5Njay5ycL5C+c3jmPHF7kad+E/611rYQCX/0yoYfn+Q4EbCi1v/nhin36/X2t5PtU9JLV5bMzM0mXp+5vNZsRRRLfbZTgcWhxIStrtNgcHB3WsidY24Gw6nVr2amNQsxlKWxMxTRPSJEGpknxR0G63yDIbxtzptLl4cYtLly6x++hRzfgkBDamJkvpdLs8evCApG2jTyfjMYe7u3Q6HWYL6wH7oI3/VAgFWLXTw5iFUDUMJxhOV/2BlcXu22lAUniNs67l34ujuF50YcBU87e3s5sLsHlChielz23w1wjHwNUhQjpaeq01plKUJidOEjpZy2bv6aq+T0SQOYg3IWykoEAEobqr2kIohIE6/bx5AjcFXRMI9M/gx+U0jS3ECPzn/RxZarfHg5GWsRZ6ReiEAiXEkcJ7C7GbJh5kjI141FqzsbFBlmWMRqM6rNyn9XtcII4ty3OW2BB2zwTlzb9e18YpeDNlNpuxsWGBdM8G7nGnbtd6lmbzhbsPW/dj4KpFp5Gl75+Mp6yvr9HpdJlOp7z77rsYN3/tVovj0cjyaESSIs+5v7sLkawLNe/u7tLv9Vnf2Kyxr7PaUyEUhMu59xu/qSnA45hDEwQKW7hpfTtNHVTGUAXuqbNMCruIVk/FEBn3myyKIkeWOq/fa6q3oboaXi9UdcOMPymF28xuE7lFXpYFZVm4hQpJHINYFaQYW7i2aT7Jxql5KuAHK7hCM5gp1JzC8QuvF5oHoYYU2vPAkm/BmWAGG6QWjk+YXenHp+nhOM10bGoOTezJP7cQ1k23vb3NxsYGW1tbTCYTJo6fE6h5MZIkIZER7Vab2JtFxs6BTZqratKWrNNGac3+vq3mZIypyXvW19drIXH12nWMsfk9Fy5sUJUlDx/cZ+fkmHa7zeXLl221s8WCKBIcHR3RSlO2t7fZuniRnZ0dkjji0tYmUWTrkfSGAy5dvsz2w4dMJlOqStFu20jGOPm3CF56Ek1rXcemh4s63KTNhQ2r6HrYfHx+6DL0n1vBKIS
sS3ivRAeeBiw5YDAEQL1t6Be+r8LkUfdQQwg3l1ddQyHh06ubaq6Uthq1/2wYFq3UMsU7jlOEjFYwjHBz+nE2xtKlN0/8praAsaZGE1wNhYe/x9PmzfcdzkFo5zdtef+5JLF08+F8hffl7yP0cnhh4+fRf64JTDc1m/AQ8mNbVZUtweYElcd2oiii3W7XKfBpmtb8kVXpSW+FEwoWSOx2u+R5Tu7wBaVUncPir2+Modfr0ev1OBodQ2ld2Yf7B8xmEyJp3Y9Z1mY8PqYoFly7doUsyxj0+xwcHHD71i20quj31knTlMPDQ65fv0ZnMGB8cuJiI4YYYz0vs9nMVvo+oz0VQgEcrz+NjS4E4ozT238mxA/q7/G42tkE2YzxBVCkjWqz3S1zCBzgYLSleHPcruCqElmhEoERaGVQQiOFpqwqojim1+87qvIYrVxdAqdtRBpMtfRYaKOJhMBIe6rFfiNJaXkFheUB8AvdGIN0TEZaW0ekMLouMtJKM5tFV1X2/v14CAtgabAl16UPVrIYxErUjrDQhJFiyYoayUawj43p90FBdWKhsHqNiOz3LKeiRlWlxTnipVaVJokTUrYCEsIQyegxgRBiB94tWq8RKam3vh9jKREGImOvUakKtCaKbe0PYyzLt3D36oVDK81sZfKiJJ8vljiCNhSL3N6isAQqVVnVQxZL55YWAqNNnWqddbp0uz0HLK451iMb2pwkKUmSkucFezuPbM3HouDRowd1XoQU1vd1fDKmKCv6vQFFqRDSFjTKkoTFYk4+n5FEkEWCVhIxnUxI++tESQs1W9DpdikXCxbTMaIqiNT8zH31lAgF23TjlHGBt0tBEfztW3gSnCYgQrcXrCL/gC1aWtuyy9DaFb+7i34UWCYkYwBblBlbgchFR2pDeXRUn2YYgypLm8EoBMYFF0VCUmqFLoraTtXKkb8aU3NH2kQee3oVZeF3nA3tjqK6cKgxzoWoFEL42gcRSqvaPHIjZJ9dYDezY53yG1pIavkAnkw1+NvoFRMuxCZ8SDmhtuGe2YZWGeIkqs0fO0eGoiweI645C4MI8QIPhNbCw2seLhIzdhGbkbR9VlVF5fJDtNHONWtqc6rWdpRyAtx6cozXmIwtGxBFUR02rVnGnyhjw4erqiKOIjqdTp2Zaty8nZyMKYoDFosFly9fRinFZDKh1+uR5wvyYoFWmqtXr5KmKd1uh8PDIyptKQE7vR7tdkbsohsPDw/Z2rzAdDYliyQnJ8ekkeBTLzyPkTFpd8BguMZstmA+nSKNIhKGKAIRP+XeB4/ihie/twUfs/c/wPa3by/Rbv/bZ7M1XVS+v1BdD1+rhYqwJ2bz2v7zXnVVSkFlVoQC2jIee9Zo6RaqEqY2M5pZleEpGcfuPhuUdX4x+k1pUzU8CUhVq8ZhOLD7iCWrxWDp4OvwBnfCm2VfgYALU8ydAAATOUlEQVStTY9GIFa4cZvejGa8R5baGoYeEwg9NOHcNXGB0GUZzv1Z9yCbcxuYNisgr1iGcitta4L4+wrXSjg3fs7iOCZ2Gg3GanJFWSKFIEuz2lygLImMqQWfd1MWRUG/30dKyWg0oj/o0+52aLXatsamMZSVYuviRYSwHB3T+ZxFvqDbTinLgjR2xLB5QWfQZ/vBXdaGA+IkozKGR9sPGQwH6Kqy5REXM1vlGlvj4qz2VAgFISANuAnABuD4yVwJTmEVNDwN5W7ayt4F5D/ftEc9o1IzF3+5gFbJRk5zg0opHXW8UzfddSIvNLxP39iYAhP58nQ2FNeHMftnMW4hRdgaEFmW2WzJ2gOz6trzi9NnYIaL+rRYDf96E3wLUfomoOuFtX+tOZbh/YSMykvsgMe+G4KupwGGTXC4FrgsTb3aWyOkLRAc3LMP3kLYNGgpZW3mWFevrJUjDZRFidLK1QUxdbKau1DNXq21RgVnUyRskZUsy4iTuE67j7OKOBD6WZbVyWIekN7a2rL1LCLJ5uZmjRVFjhPB4hg
Ri0XOrMxZG16h2+miVElRVhhVIqWg2+tTFhW33n6by1euUeWKH9y9Q5nnYBS9TkY7zcBUzOZPuflQq6L14gNPkmJfcPTcDVPBfzdsfjOHEYch+3O4YEJB4N9vBhjZ6z1+v7B6Mgq36OpgHqcVyCh2BU4CKnghqVg9Sb0/3IfE+nuwhKuqPv38hiqUBSZXkrdYemlCkNH/1KCas6Mty3kjsMsBm4ZVodMEfUOtrsk+5fsOT2cvAE+Lbwjnw3/XP7//bJhm7U0uGwa+1Pxi56FIksSlOxsLJktpTSkPFmuNNAbl6e6D9aRcPQjtSVicEFBmWXRHRhLM0mNkAca4LlWX5zm5wzYqV8AmSZKa3LXdbtNut+u4hbIsOZnYIkcPt7fZ3NwkdtW980VO2xUMTrOM555/jnw+ZTqf0+91WOQFaSSZL3IurA85Go04PjlmMpsTZV22rlxh995dVFUhhGQ8GdNttxkM1jirPRVCwWqyzq4Vnlff1n0U/kcbkGLF7j9rAYUnW4iKh0h38z3/veZGOA2w9C0UCv47vtApzhZVSqGMqU+sMAcvPGF9qHR4D1JKVOHcbw4BF5GkUoq8cKQpwfP4DZ6mac367DdTyBjltQqBrdHZRPGbtnxoQoSCqfnsXkMJ8xOaGlaovYSfCTED31eYD7Myxg18KIosbpAkCVnsGKaEN6v8vS/N0DiObYFZVxTXC4tQg6k1RWnH299n7A4Nm8ZvLI+jX4PuhC/yAoxNfirLklZsk6b6/T5Zlrm6HHauPa1/1u4wmcxcNGRFt9vl0qUrjEYjW2J+c9OCm1VFWVUgI6I4YT6bopOItUGPRV5Q5rlzTwtu3LjG/bt3kRIKVXGwv8/62hrtTof57CmneJeRLaihHLW3T5EWHmuQNuhGidXN21T1ffObLNzAoT0MPCYo/OIL1e6lgFj1uYfxBOFi9tcOTQ9/nUgECVoOVJRuw552mqcuJZfYlnrzlbOMM0UeM2GMVYVrBmO5DFAJXW+RtEVcjZBIbDai5YYQNdOQsaQSqAaeEwqMpqsxdBOGMRj++0qplcUWuiPDjdjEi5pz7e/Dtzi2ppVH+6WQtXCU0hGu4nkaHI4RNdyqWlvBLVYzcUOy1rpvhxMJjE2dD/JavJlggqJEcSutXc6erk0pS8zr6dpOTk5oaUO3PyCOY9bX1+uIyLULmwhsvdX9oyMsX4t1beZFRaU15WQBRtNpZxhVIRBMJxNOjg5J44hOt8+D6Riw6+rC5ha33z3mrPZUCAXPEGa9gKaO8/cqrD95pbSS235s9eSBVdOA4Lvhpm/aqk0V25sUIZBn3ZaPR1I2/eNCCES01HakXDIB+yAiYwxGaCoeB9dCoeLNnm6rbSP8HLvxoshXwLCo5m5Y3fyemzEUkNbMiK2l7IBTH+FYj1FgehlVPTa+zY3bxFiamyh8Ro2px8Xfb1MgN82UENRsvp64jNIkSRyRaYnSJQX2EInjmLjVIo5sTEipfCzKquovo6iODnSrDimd50Tg+BiXz2CMJsJVjXKmmy87p5VCCImMqLkzvbYW5ol4oleA4XBI2unS6fXY3Nyk3W5z8eJFLl26xK1bt7h//z5JktLr9ZmcHBMlEWVZITAcHBwxn425evEiulKoqiDdzJhMpty78x7PPvcce7u7XLlyhUhG9Ps9JtMJaZqdshFteyqEgjF6Jewy3LAQLDRHRgKPq7LhSdMEyPzGaC7esPn/hym24XUEqyShZwmX2p/XaDVAZsAITSS9a2w1UzEMEJrP56iiqkNmpZQgBYs8p7Qlk1ewkFC9Pm1zWTXbFlsBR5ZSqZU+jSujp7SmMqtj6vsJN3IYiBSe6qfdg29+7kJBEJo2p81jff8ON4idiqy1LeeuXNBSLKzbWArq4CMf+p2XRS0wveblBXjl57ERTAXUyXBAnXxmzYqq1ui8F8gmMjleyICP03MnRFFU17nY3Nzk4sWLSCnpDtfpDdcRMuLTr3yGnZ0dvv6vX6X
b7XLz2ed5//33OTocsb4x5OLmOocH+7z15ptICdcuXaTT6zIZn5ClNngqTVLGkxPefvstqkrRHQ7ptW1exN27dynK6tR1Ck+NUACfmGRcEE7sF7m0Qb4Yg4qwrgqWSPnyGqvuy9M2ffidEFdofm713uzJIE/pz0c3+gXkfdewVDN9AJSInB3tFpSJBMKsAnZNTwpQ06hHsSVg6ff7NhNPqzp6zy/0JhYSnrS1gJViZbOFgGQIsprq8XqDTS2gBucCt6f/TOjBqIWGXs21OA2bCHGFlY1pTH1w2FDfqNYavZCxVb+X3qfQU9LKWuQOzLVxBxpdFs6duMQPRCDcmiZNOEaYJbWd0Roi6ixVy2zlUrqVqmtceiGVpikbGxskScJoNOLGjRusb2wwWNug1Wpx+/ZtxuNxfRgcHBxgjOHGM88w6Hc4ONhle3uH/qDPcDDA6IqjoxGL2RSBod/rMRz0mS2mLOZzhJTs3L9H6+azNjy61WY8fsoxBWMMpaqI4sgWho0ju6mMRmi3MA0uEm91cYYqd/N0gseBrrDP5j2cph5bLQGUUfhglziOQEQYE1sAS1vOQqW1FWJ+kwvhouaWLi/vZamKZVqyt4EtiGVPbylsIlSU2pTcsiqRhbQs7kKQRDGFtDUcCGziMAAo1CL8IpdSkmVLItdw0/oFX1WWJzAWwgU5WcEofIASzsXnokE9uUtTiDTH146VOwSk32SxSw93cyls7ADCmgDezMEYGyxWKTvWasm16E97YwxVWVrTwVUYL8uy3ujz3FZ1irz72IGI9Vpy8wWPu679s2lt12UhIEsjKq3Jq5LI5aAY5+DU2lY0K/KcKI5Js3Sp7cQxeVHQ7nS5+dyzrK9vMFzbYDK1eRC9/oDxZEJRKRCSza1LXL16g4O9fe7d22axmJCkLTqdNYy2qdfz6ZR8PmV9bQhRxt7RGKEMZTUny1Im830GG+tEswlxnNg1fEZ7KoQCApSkXhhC2rLf6CV3IAYbvcfq4lsB85x6eZrdC5x6kp5lRqyYB8KaOO4DSOndnRprZer6HtHiMQNCSkmlrb0qzSoVm/QbDDBCItzJ7VOeK62pHKkoZUFeFvWGjqUFBG26sSFyIKMPDmpqTkmSuEg/q8L6qlU1UObLpRmrqaWxpCwtlpFXFUL7+JHl5hEiwu3lMwVybR5VNpTdCkZp4wrwlaxtFKbWGhUQrITENGEMRUSEkWCECyl2LsjSb3oEpXMHyijCCKi0sqzSDl+pY7Zg6aZ1AsqOm3/X/ljlzwonIw2mrDBIRCSR+YIkjjFV5Uh5NEYJxwthQMfIOKbTadPqdIiTlK3Ll+kNh1QGJpMp/d4ALWD/8IDuYMDoeMyFi5fBCN568xYoQ6fdo9fvkWUZj7Yfsre3jyoLyrIiTbtkvTUe7BzS7XZIpKHXH3DjxjUODnY4mY5pt9p0oojh+pCz2tMhFFzzoboSS8HtvXf+pBCI2r4LN23ongwLv8LjGX3+tVC9DX+H1zzt9fB3szW1Ed+aavKK+o5ffNTBMuHpbTUBq36Gfn6/yX2//uQLM0TDFHSr4cSWfVmtahXeDPEcAhYdbxNJT0EnrNYj7cZStbAWSAkmEHTNcV4ZO+91Ccyt08Y21GL8a+E8r4xn5MfSrRn7hRUTRmntAMJVbbI5xx5BOGtum1qnMobSjg66LEmiiFhK0iQGg823iGzSU5ZldHo94sRuOY8rLBYL0rQFMRwc7CPThHavR384RGkYjUbcu3ufYafLiy+/SKUKCr3gzp07vPfeu3S7lr9zbc2WrJdScHiwj8DQaSXEUY+LW1tIqSnyOVLa6Mhe5yOq+/DxNQdO+Xx3YZNKBNRJRGiDTORKsA0sN7i3VcMsRlgFxpqt6TP3iy+0wWG5qZsmy2NPcYrm0Xy9CajhBJ0fA6RY+U4cx4hoyX2wAn66+yrL0p5eZpntB6y4JmuvilhWY/KRnN5e99/VWtNpt2llsSMcBS1ACVvEdpUqrQHENcY
gFOBeIHjhFdK7h2PjrxPOQ+gRquePsBS9dW3HcWxtfB14NDAQfLcpHGoh9gHC/rQ1pA1URoHRGCnRcYxIUuJWTJImoCKSVpt2p8PW1kXiNGU6dQSu3S7z+dxVIcsYTybIKCLDmouz+Yyj0Yid7Ud0Wh1+4id/kjRKOT45osgt98NLL71kK4uNjmi1MmazGePxCe1Wi+lsRpb00Mbw/TffJM+nbG1uOA1Sc3BwcOqzwtMiFNwa8pmSOPYAX7bL25a6tmVXw3tDwXCWmzLUCJqvNeMPfAtPh+ZC/yD7ufleeHo+Lhz8ACwFg9Ea7Z8ziohju3n95g4rNPvfVSAYQzPKP1udayAqjFmGd3t0vCiK2o0JkGYp3a5lCfIUZf7aSgOEwvf01OZwvDyQ3BSQobALvU2rLuHHsR4/eHFiOQwEkCU2HbksCjS23oXWNlw5vFaTrak2T9TjWl5zzptz780MY8xjQs8LfA+YK6UoygKV58goRghfWjCllbW5cvUq61sX2D84xAjB5sUtnrnxDD/+yo9TzBfcv3Of3qDH6MGBI1fJePDgAf1Om/F4glYVs9mUi1tbdNstVDnn0aNHPHP9Cp/+9KeZTsdMTk5Ik7Qeg1PX71mq8JNsQog9YArsf0K3sPkJ9v2j3v+P8rN/0v3fNMZsNV98KoQCgBDim+YUDvr/v/f9o97/j/KzPw39n9ZON47P23k7bz+y7VwonLfzdt5W2tMkFH7zR7TvH/X+f5Sf/Wno/7H21GAK5+28nbenoz1NmsJ5O2/n7Slon7hQEEJ8UQjxlhDiHSHEbzyhPu8IIb4jhHhNCPFN99qGEOKrQoi33e/1j7C//10IsSuE+G7w2pn9CSH+azcebwkh/v2Poe+/K4R44J7/NSHEL38cfbvr3RBC/KEQ4k0hxPeEEP+Fe/1jf/4P6PuJPL8QoiWE+FMhxOuu///uST37v1UL4+Of9A8QAbeB54EUeB145Qn0ewfYbLz23wO/4f7+DeDvf4T9/QLwBeC7/6b+gFfcOGTAc258oo+4778L/FenfPYj7dtd8wrwBfd3H7jl+vnYn/8D+n4iz4+NSuu5vxPgT4B/50nN/Q/780lrCn8ZeMcY864xpgC+DPzqJ3Qvvwp8yf39JeA//KgubIz5V8Dhh+zvV4EvG2NyY8x7wDvYcfoo+z6rfaR9u/63jTHfcn+PgTeBazyB5/+Avs9qH/XYG2PMxP03cT+GJzT3P2z7pIXCNeBe8P/7fPCkfVTNAF8RQvy5EOLX3WuXjDHbYBcTcPFjvoez+ntSY/KfCyHecOaFV18/1r6FEM8Cn8eemE/0+Rt9wxN6fiFEJIR4DdgFvmqMeeLP/hdtn7RQOC2B4Em4Q37eGPMF4G8A/5kQ4heeQJ8ftj2JMfkHwAvA54Bt4H/4uPsWQvSA/xv4L40xJx/00Y/6Hk7p+4k9vzFGGWM+B1wH/rIQ4jMfdKsfdf8/TPukhcJ94Ebw/+vAw4+7U2PMQ/d7F/hdrIq2I4S4AuB+737Mt3FWfx/7mBhjdtxi1cD/ylJF/Vj6FkIk2E35fxpjfse9/ESe/7S+n/Tzuz5HwNeAL/IJzv2HaZ+0UPgz4EUhxHNCiBT4m8A/+Tg7FEJ0hRB9/zfwS8B3Xb+/5j72a8A//jjv4wP6+yfA3xRCZEKI54AXgT/9KDv2C9K1/wj7/B9L38KmFP5vwJvGmP8xeOtjf/6z+n5Szy+E2BJCrLm/28BfB37AJzj3H6o9aWTzFIT2l7Go8G3g7zyB/p7HIryvA9/zfQIXgD8A3na/Nz7CPv8hVk0tsafB3/qg/oC/48bjLeBvfAx9/x/Ad4A3sAvxysfRt7veX8GqwG8Ar7mfX34Sz/8BfT+R5wd+Avi26+e7wH/zb1prH/X4/zA/5xGN5+28nbeV9kmbD+ftvJ23p6ydC4Xzdt7O20o7Fwrn7bydt5V2LhTO23k7byvtXCict/N23lbauVA4b+ftvK20c6Fw3s7beVt
p50LhvJ2387bS/j+Er59CDmqSnwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Testing event\n", - "cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'\n", - "response = requests.get(cat_image_url)\n", - "cat_image = response.content\n", - "img = Image.open(BytesIO(cat_image))\n", - "\n", - "print('Test image:')\n", - "plt.imshow(img)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Function specifications" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from mlrun import mlconf\n", - "\n", - "# Specific model variables\n", - "function_envs = {\n", - " 'IMAGE_HEIGHT': 224,\n", - " 'IMAGE_WIDTH': 224,\n", - " 'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the serving function to the cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Setup the model server function\n", - "\n", - "fn = code_to_function('tf2-serving-v2', kind=\"serving\")\n", - "fn.spec.description = \"tf2 image classification server v2\"\n", - "fn.metadata.categories = ['serving', 'dl']\n", - "fn.metadata.labels = {'author': 'yaronh'}\n", - "fn.export(\"function.yaml\")\n", - "fn.set_envs(function_envs)\n", - "fn.add_model(key=\"model\",\n", - " 
model_path=\"/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5\",\n", - " class_name=\"TFModel\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if \"V3IO_HOME\" in list(os.environ):\n", - " from mlrun import mount_v3io\n", - " fn.apply(mount_v3io())\n", - "else:\n", - " # is you set up mlrun using the instructions at\n", - " # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md\n", - " from mlrun.platforms import mount_pvc\n", - " fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-01-29 23:47:54,893 [info] Starting remote function deploy\n", - "2021-01-29 23:47:55 (info) Deploying function\n", - "2021-01-29 23:47:55 (info) Building\n", - "2021-01-29 23:47:55 (info) Staging files and preparing base images\n", - "2021-01-29 23:47:56 (info) Building processor image\n", - "2021-01-29 23:47:57 (info) Build complete\n", - "2021-01-29 23:48:07 (info) Function deploy complete\n", - "> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946\n" - ] - } - ], - "source": [ - "# Deploy the model server\n", - "addr = fn.deploy(project='cat-and-dog-servers')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test the deployed function on the cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "payload = json.dumps({\"data_url\" : cat_image_url})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'id': '38224902-a688-4985-9424-578ff9ccb4a5',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict', body=payload)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the deployed function (with Jpeg Image)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',\n", - " 'model_name': 'model',\n", - " 'outputs': [0.0]}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.invoke(path='/v2/models/model/predict',\n", - " body=cat_image,\n", - " headers={'Content-type': 'image/jpeg'})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/tf2_serving_v2/latest/src/tf2_serving_v2.py b/functions/development/tf2_serving_v2/latest/src/tf2_serving_v2.py deleted file mode 100644 index d3642c20..00000000 --- a/functions/development/tf2_serving_v2/latest/src/tf2_serving_v2.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import warnings - -warnings.simplefilter(action="ignore", category=FutureWarning) - -import json -import numpy as np -import requests -from tensorflow import keras -from tensorflow.keras.models import load_model -from tensorflow.keras.preprocessing import image -from tensorflow.keras.preprocessing.image import load_img -from os import environ, path -from PIL import Image -from io import BytesIO -from urllib.request import urlopen -import mlrun - - -class TFModel(mlrun.serving.V2ModelServer): - def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file) - - def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}") - - def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0] - - -from mlrun.runtimes import nuclio_init_hook - - -def init_context(context): - 
nuclio_init_hook(context, globals(), "serving_v2") - - -def handler(context, event): - return context.mlrun_handler(context, event) diff --git a/functions/development/tf2_serving_v2/latest/static/documentation.html b/functions/development/tf2_serving_v2/latest/static/documentation.html deleted file mode 100644 index 99a5cbd7..00000000 --- a/functions/development/tf2_serving_v2/latest/static/documentation.html +++ /dev/null @@ -1,250 +0,0 @@ - - - - - - - -tf2_serving_v2 package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

tf2_serving_v2 package

- -
- -
-
-
-
-
-

tf2_serving_v2 package#

-
-

Submodules#

-
-
-

tf2_serving_v2.tf2_serving_v2 module#

-
-
-class tf2_serving_v2.tf2_serving_v2.TFModel(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]#
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]#
-

model loading function, see also .get_model() method

-
-
-
-predict(data)[source]#
-

model prediction operation

-
-
-
-preprocess(body, operation)[source]#
-

preprocess the event body before validate and action

-
-
-
-
-tf2_serving_v2.tf2_serving_v2.handler(context, event)[source]#
-
-
-
-tf2_serving_v2.tf2_serving_v2.init_context(context)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/static/example.html b/functions/development/tf2_serving_v2/latest/static/example.html deleted file mode 100644 index edba841b..00000000 --- a/functions/development/tf2_serving_v2/latest/static/example.html +++ /dev/null @@ -1,680 +0,0 @@ - - - - - - - -Image Classification Model - Serving Function - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Image Classification Model - Serving Function#

-

This notebook demonstrates how to deploy a Tensorflow model using MLRun & Nuclio.

-

In this notebook you will:

-
    -
  • Write a Tensorflow-Model class to load and predict on the incoming data

  • -
  • Deploy the model as a serverless function

  • -
  • Invoke the serving endpoint with data as:

    -
      -
    • URLs to images hosted on S3

    • -
    • Direct image send

    • -
    -
  • -
-

Steps:

-
    -
  • Define Nuclio function

    -
      -
    • Install dependencies and set config

    • -
    • Model serving class

    • -
    -
  • -
  • Deploy the serving function to the cluster

  • -
  • Define test parameters

  • -
  • Test the deployed function on the cluster

  • -
-
-

Define Nuclio Function#

-

To use the magic commands for deploying this jupyter notebook as a nuclio function we must first import nuclio
-Since we do not want to import nuclio in the actual function, the comment annotation nuclio: ignore is used. This marks the cell for nuclio, telling it to ignore the cell’s values when building the function.

-
-
-
# nuclio: ignore
-import nuclio
-
-
-
-
-
The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
-
-
-
-
-
-

Install dependencies and set config#

-
-

Note: Since tensorflow is being pulled from the baseimage it is not directly installed as a build command. -If it is not installed on your system please uninstall and install using the line: pip install tensorflow

-
-
-
-
%nuclio config kind="serving"
-
-# tensorflow 2 use the default serving image (or the mlrun/ml-models for a faster build)
-
-%nuclio config spec.build.baseImage = "mlrun/mlrun"
-
-
-
-
-
%nuclio: setting kind to 'serving'
-%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
-
-
-
-
-

Since we are using packages which are not surely installed on our baseimage, or want to verify that a specific version of the package will be installed we use the %nuclio cmd annotation.

-
-

%nuclio cmd works both locally and during deployment by default, but can be set with -c flag to only run the commands while deploying or -l to set the variable for the local environment only.

-
-
-
-
%%nuclio cmd -c
-pip install tensorflow>=2.1
-pip install requests pillow
-
-
-
-
-
-
-
-

Function Code#

-
-
-
import warnings
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
-
-
-
-
-
import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
-
-
> 2021-01-29 23:47:50,165 [warning] Failed resolving version info. Ignoring and using defaults
-> 2021-01-29 23:47:51,342 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': '0.6.0-rc9', 'client_version': 'unstable'}
-
-
-
-
-
-

Model Serving Class#

-

We define the TFModel class which we will use to define data handling and prediction of our model.

-

The class should consist of:

-
    -
  • __init__(name, model_dir) - Setup the internal parameters

  • -
  • load(self) - How to load the model and broadcast it’s ready for prediction

  • -
  • preprocess(self, body) - How to handle the incoming event, forming the request to an {'instances': [<samples>]} dictionary as requested by the protocol

  • -
  • predict(self, data) - Receives and {'instances': [<samples>]} and returns the model’s prediction as a list

  • -
  • postprocess(self, data) - Does any additional processing needed on the predictions.

  • -
-
-
-
class TFModel(mlrun.serving.V2ModelServer):
-
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get('IMAGE_WIDTH', '128'))
-        self.IMAGE_HEIGHT = int(environ.get('IMAGE_HEIGHT', '128'))
-        
-        try:
-            with open(environ['classes_map'], 'r') as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-        
-        model_file, extra_data = self.get_model('.h5')
-        self.model = load_model(model_file)
-        
-    def preprocess(self, body, operation):
-        try:
-            output = {'inputs': []}
-            inputs = body.get('inputs', [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                # Load image
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output['inputs'].append(x)
-            
-            # Format inputs list
-            output['inputs'] = [np.vstack(output['inputs'])]
-            return output
-        except:
-            raise Exception(f'received: {body}')
-            
-
-    def predict(self, data):
-        images = data.get('inputs', [])
-
-        # Predict
-        predicted_probability = self.model.predict(images)
-
-        # return prediction
-        return predicted_probability.tolist()[0]
-
-
-
-
-

To let our nuclio builder know that our function code ends at this point we will use the comment annotation nuclio: end-code.

-

Any new cell from now on will be treated as if a nuclio: ignore comment was set, and will not be added to the funcion.

-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test the function locally#

-

Make sure your local TF / Keras version is the same as pulled in the nuclio image for accurate testing

-

Set the served models and their file paths using: SERVING_MODEL_<name> = <model file path>

-
-

Note: this notebook assumes the model and categories are under /User/mlrun/examples/

-
-
-
-
from PIL import Image
-from io import BytesIO
-import matplotlib.pyplot as plt
-import os
-
-
-
-
-
-

Define test parameters#

-
-
-
# Testing event
-cat_image_url = 'https://s3.amazonaws.com/iguazio-sample-data/images/catanddog/cat.102.jpg'
-response = requests.get(cat_image_url)
-cat_image = response.content
-img = Image.open(BytesIO(cat_image))
-
-print('Test image:')
-plt.imshow(img)
-
-
-
-
-
Test image:
-
-
-
<matplotlib.image.AxesImage at 0x7f01b9643350>
-
-
-_images/cd2de2ed0d1841d97e37ee5a2c7a1f70e7e7ba5a4dc5416e11e1af47c1b99e03.png -
-
-
-
-

Define Function specifications#

-
-
-
import os
-from mlrun import mlconf
-
-# Specific model variables
-function_envs = {
-    'IMAGE_HEIGHT': 224,
-    'IMAGE_WIDTH': 224,
-    'classes_map': '/Userv3io/projects/cat-and-dog-servers/artifacts/categories_map.json',
-}
-
-
-
-
-
-
-
-

Deploy the serving function to the cluster#

-
-
-
from mlrun import code_to_function, mount_v3io
-
-
-
-
-
-
-
# Setup the model server function
-
-fn = code_to_function('tf2-serving-v2', kind="serving")
-fn.spec.description = "tf2 image classification server v2"
-fn.metadata.categories = ['serving', 'dl']
-fn.metadata.labels = {'author': 'yaronh'}
-fn.export("function.yaml")
-fn.set_envs(function_envs)
-fn.add_model(key="model",
-             model_path="/User/mlrun_repos/demos/image-classification-with-distributed-training/pipe/52f2145e-7a54-4137-8c7b-b6c20cc8b1fd/tfmodels/model.h5",
-             class_name="TFModel")
-
-
-
-
-
> 2021-01-29 23:47:54,881 [info] function spec saved to path: function.yaml
-
-
-
<mlrun.serving.states.TaskState at 0x7f01b86cbd90>
-
-
-
-
-
-
-
if "V3IO_HOME" in list(os.environ):
-    from mlrun import mount_v3io
-    fn.apply(mount_v3io())
-else:
-    # is you set up mlrun using the instructions at
-    # https://github.com/mlrun/mlrun/blob/master/hack/local/README.md
-    from mlrun.platforms import mount_pvc
-    fn.apply(mount_pvc('nfsvol', 'nfsvol', '/home/joyan/data'))
-
-
-
-
-
-
-
# Deploy the model server
-addr = fn.deploy(project='cat-and-dog-servers')
-
-
-
-
-
> 2021-01-29 23:47:54,893 [info] Starting remote function deploy
-2021-01-29 23:47:55  (info) Deploying function
-2021-01-29 23:47:55  (info) Building
-2021-01-29 23:47:55  (info) Staging files and preparing base images
-2021-01-29 23:47:56  (info) Building processor image
-2021-01-29 23:47:57  (info) Build complete
-2021-01-29 23:48:07  (info) Function deploy complete
-> 2021-01-29 23:48:08,029 [info] function deployed, address=default-tenant.app.us-sales30-demo.iguazio-cd2.com:31946
-
-
-
-
-
-
-

Test the deployed function on the cluster#

-
-

Test the deployed function (with URL)#

-
-
-
payload = json.dumps({"data_url" : cat_image_url})
-
-
-
-
-
-
-
fn.invoke(path='/v2/models/model/predict', body=payload)
-
-
-
-
-
{'id': '38224902-a688-4985-9424-578ff9ccb4a5',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-

Test the deployed function (with Jpeg Image)#

-
-
-
fn.invoke(path='/v2/models/model/predict',
-          body=cat_image,
-          headers={'Content-type': 'image/jpeg'})
-
-
-
-
-
{'id': '246c00fc-225c-44ec-b221-4e6c99f7bc5d',
- 'model_name': 'model',
- 'outputs': [0.0]}
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/static/function.html b/functions/development/tf2_serving_v2/latest/static/function.html deleted file mode 100644 index 1d00ec42..00000000 --- a/functions/development/tf2_serving_v2/latest/static/function.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: tf2-serving-v2
-  tag: ''
-  hash: 8748deb1d9804f9b436c913322c84d5b46c82bd9
-  project: ''
-  labels:
-    author: yaronh
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  description: tf2 image classification server v2
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec:
-    apiVersion: nuclio.io/v1
-    kind: Function
-    metadata:
-      name: tf2-serving-v2
-      labels: {}
-      annotations:
-        nuclio.io/generated_by: function generated from /home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-    spec:
-      runtime: python:3.6
-      handler: tf2_serving_v2:handler
-      env: []
-      volumes: []
-      build:
-        commands: []
-        noBaseImagesPull: true
-        functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCByZXF1ZXN0cwpmcm9tIHRlbnNvcmZsb3cgaW1wb3J0IGtlcmFzCmZyb20gdGVuc29yZmxvdy5rZXJhcy5tb2RlbHMgaW1wb3J0IGxvYWRfbW9kZWwKZnJvbSB0ZW5zb3JmbG93LmtlcmFzLnByZXByb2Nlc3NpbmcgaW1wb3J0IGltYWdlCmZyb20gdGVuc29yZmxvdy5rZXJhcy5wcmVwcm9jZXNzaW5nLmltYWdlIGltcG9ydCBsb2FkX2ltZwpmcm9tIG9zIGltcG9ydCBlbnZpcm9uLCBwYXRoCmZyb20gUElMIGltcG9ydCBJbWFnZQpmcm9tIGlvIGltcG9ydCBCeXRlc0lPCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KaW1wb3J0IG1scnVuCgoKY2xhc3MgVEZNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgc2VsZi5JTUFHRV9XSURUSCA9IGludChlbnZpcm9uLmdldCgiSU1BR0VfV0lEVEgiLCAiMTI4IikpCiAgICAgICAgc2VsZi5JTUFHRV9IRUlHSFQgPSBpbnQoZW52aXJvbi5nZXQoIklNQUdFX0hFSUdIVCIsICIxMjgiKSkKCiAgICAgICAgdHJ5OgogICAgICAgICAgICB3aXRoIG9wZW4oZW52aXJvblsiY2xhc3Nlc19tYXAiXSwgInIiKSBhcyBmOgogICAgICAgICAgICAgICAgc2VsZi5jbGFzc2VzID0ganNvbi5sb2FkKGYpCiAgICAgICAgZXhjZXB0OgogICAgICAgICAgICBzZWxmLmNsYXNzZXMgPSBOb25lCgogICAgICAgIG1vZGVsX2ZpbGUsIGV4dHJhX2RhdGEgPSBzZWxmLmdldF9tb2RlbCgiLmg1IikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZF9tb2RlbChtb2RlbF9maWxlKQoKICAgIGRlZiBwcmVwcm9jZXNzKHNlbGYsIGJvZHksIG9wZXJhdGlvbik6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBvdXRwdXQgPSB7ImlucHV0cyI6IFtdfQogICAgICAgICAgICBpbnB1dHMgPSBib2R5LmdldCgiaW5wdXRzIiwgW10pCiAgICAgICAgICAgIGZvciBieXRlX2ltYWdlIGluIGlucHV0czoKICAgICAgICAgICAgICAgIGltZyA9IEltYWdlLm9wZW4oYnl0ZV9pbWFnZSkKICAgICAgICAgICAgICAgIGltZyA9IGltZy5yZXNpemUoKHNlbGYuSU1BR0VfV0lEVEgsIHNlbGYuSU1BR0VfSEVJR0hUKSkKCiAgICAgICAgICAgICAgICB4ID0gaW1hZ2UuaW1nX3RvX2FycmF5KGltZykKICAgICAgICAgICAgICAgIHggPSBucC5leHBhbmRfZGltcyh4LCBheGlzPTApCiAgICAgICAgICAgICAgICBvdXRwdXRbImlucHV0cyJdLmFwcGVuZCh4KQoKICAgICAgICAgICAgb3V0cHV0WyJpbnB1dHMiXSA9IFtucC52c3RhY2sob3V0cHV0WyJpbnB1dHMiXSldCiAgICAgICAgICAgIHJldHVybiBvdXRwdXQKICAgICAgICBleGNlcHQ6CiAgICAgICA
gICAgIHJhaXNlIEV4Y2VwdGlvbihmInJlY2VpdmVkOiB7Ym9keX0iKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGRhdGEpOgogICAgICAgIGltYWdlcyA9IGRhdGEuZ2V0KCJpbnB1dHMiLCBbXSkKCiAgICAgICAgcHJlZGljdGVkX3Byb2JhYmlsaXR5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGltYWdlcykKCiAgICAgICAgcmV0dXJuIHByZWRpY3RlZF9wcm9iYWJpbGl0eS50b2xpc3QoKVswXQoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKCgpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICJzZXJ2aW5nX3YyIikKCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-  source: ''
-  function_kind: serving_v2
-  build:
-    commands:
-    - python -m pip install requests pillow tensorflow>=2.1
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/tf2_serving_v2/tf2_serving_v2.py
-  secret_sources: []
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/static/item.html b/functions/development/tf2_serving_v2/latest/static/item.html deleted file mode 100644 index fdc080f8..00000000 --- a/functions/development/tf2_serving_v2/latest/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: tf2 image classification server v2
-doc: ''
-example: tf2_serving_v2.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: yaronh
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: tf2-serving-v2
-platformVersion: 3.5.0
-spec:
-  filename: tf2_serving_v2.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements:
-  - requests
-  - pillow
-  - tensorflow>=2.1
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/static/source.html b/functions/development/tf2_serving_v2/latest/static/source.html deleted file mode 100644 index 65434955..00000000 --- a/functions/development/tf2_serving_v2/latest/static/source.html +++ /dev/null @@ -1,104 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-class TFModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128"))
-        self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128"))
-
-        try:
-            with open(environ["classes_map"], "r") as f:
-                self.classes = json.load(f)
-        except:
-            self.classes = None
-
-        model_file, extra_data = self.get_model(".h5")
-        self.model = load_model(model_file)
-
-    def preprocess(self, body, operation):
-        try:
-            output = {"inputs": []}
-            inputs = body.get("inputs", [])
-            for byte_image in inputs:
-                img = Image.open(byte_image)
-                img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT))
-
-                x = image.img_to_array(img)
-                x = np.expand_dims(x, axis=0)
-                output["inputs"].append(x)
-
-            output["inputs"] = [np.vstack(output["inputs"])]
-            return output
-        except:
-            raise Exception(f"received: {body}")
-
-    def predict(self, data):
-        images = data.get("inputs", [])
-
-        predicted_probability = self.model.predict(images)
-
-        return predicted_probability.tolist()[0]
-
-
-from mlrun.runtimes import nuclio_init_hook
-
-
-def init_context(context):
-    nuclio_init_hook(context, globals(), "serving_v2")
-
-
-def handler(context, event):
-    return context.mlrun_handler(context, event)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/tf2_serving_v2/latest/static/tf2_serving_v2.html b/functions/development/tf2_serving_v2/latest/static/tf2_serving_v2.html deleted file mode 100644 index 27b5b357..00000000 --- a/functions/development/tf2_serving_v2/latest/static/tf2_serving_v2.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - -tf2_serving_v2.tf2_serving_v2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for tf2_serving_v2.tf2_serving_v2

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import warnings
-
-warnings.simplefilter(action="ignore", category=FutureWarning)
-
-import json
-import numpy as np
-import requests
-from tensorflow import keras
-from tensorflow.keras.models import load_model
-from tensorflow.keras.preprocessing import image
-from tensorflow.keras.preprocessing.image import load_img
-from os import environ, path
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-import mlrun
-
-
-
[docs]class TFModel(mlrun.serving.V2ModelServer): -
[docs] def load(self): - self.IMAGE_WIDTH = int(environ.get("IMAGE_WIDTH", "128")) - self.IMAGE_HEIGHT = int(environ.get("IMAGE_HEIGHT", "128")) - - try: - with open(environ["classes_map"], "r") as f: - self.classes = json.load(f) - except: - self.classes = None - - model_file, extra_data = self.get_model(".h5") - self.model = load_model(model_file)
- -
[docs] def preprocess(self, body, operation): - try: - output = {"inputs": []} - inputs = body.get("inputs", []) - for byte_image in inputs: - img = Image.open(byte_image) - img = img.resize((self.IMAGE_WIDTH, self.IMAGE_HEIGHT)) - - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - output["inputs"].append(x) - - output["inputs"] = [np.vstack(output["inputs"])] - return output - except: - raise Exception(f"received: {body}")
- -
[docs] def predict(self, data): - images = data.get("inputs", []) - - predicted_probability = self.model.predict(images) - - return predicted_probability.tolist()[0]
- - -from mlrun.runtimes import nuclio_init_hook - - -
[docs]def init_context(context): - nuclio_init_hook(context, globals(), "serving_v2")
- - -
[docs]def handler(context, event): - return context.mlrun_handler(context, event)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.0.1/src/README.md b/functions/development/virtual_drift/0.0.1/src/README.md deleted file mode 100644 index cd738390..00000000 --- a/functions/development/virtual_drift/0.0.1/src/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Drift Magnitude - -Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We can use the following Drift Magnitude metrics to map and understand our concepts and how close the properties of the data we used to train the models on are to the current data we receive. - -## How to integrate - -The Virtual Drift function is built to receive two data batches of data (as `dataitem` or `Dataframe`), base batch *t* and current batch *u*. - -```markdown -:param context: MLRun context -:param t: Base dataset for the drift metrics -:param u: Test dataset for the drift metrics -:param label_col: Label colum in t and u -:param prediction_col: Predictions column in t and u -:param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) -:param n_bins: Number of bins to be used for histrogram creation from continuous variables -:param stream_name: Output stream to push metrics to -:param results_tsdb_container: TSDB table container to push metrics to -:param results_tsdb_table: TSDB table to push metrics to -``` - -The function will calculate the selected drift mangitude metrics that were selected and apply them to the **features**, **labels** and **predictions**. It will then save those metrics and export them via Parquet and TSDB. Alerting could be added on top of the metrics via Grafana or a function. 
- -## Metrics - -The drift magnitude metrics we calculate are: - -### TVD - Total Variation Distance - -Provides a symetric drift distance between two periods *t* and *u* -Z - vector of random variables -P*t* - Probability distribution over timespan *t* - -![\sigma_{t, u}(Z)=\frac{1}{2}\sum_{\hat{z}\in{dom(Z)}}{|P_t{(\hat{Z})-P_u{(\hat{Z})}}|}]() - -### Helinger Distance - -Hellinger distance is an *f* divergence measuer, similar to the Kullback-Leibler (KL) divergence. However, unlike KL Divergence the Hellinger divergence is symmetric and bounded over a probability space. - -P, Q - Discrete probability distributions (P*i*, ..., P*k*). - -![H(P,Q)=\frac{1}{\sqrt{2}}\sqrt{\sum_{i=1}^{k}{(\sqrt{p_i}-\sqrt{q_i})^2}}]() - - -### KL Divergence - -KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another. It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality. KL Divergence of 0, indicates two identical distributrions. - -![D_{KL}(P||Q)=\sum_{x\in{X}}{(P(x)\log{\frac{P(x)}{Q(x)}})}]() - -## Additional Resources - -Webb, Geoffrey I. et al. “[Characterizing Concept Drift.](https://arxiv.org/abs/1511.03816)” Data Mining and Knowledge Discovery 30.4 (2016): 964–994. Crossref. Web. 
- -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/virtual_drift/0.0.1/src/function.yaml b/functions/development/virtual_drift/0.0.1/src/function.yaml deleted file mode 100644 index 1e9086e4..00000000 --- a/functions/development/virtual_drift/0.0.1/src/function.yaml +++ /dev/null @@ -1,129 +0,0 @@ -kind: job -metadata: - name: virtual-drift - tag: '' - hash: 8990fdd72fc550189a0c8b488b69997428b786c9 - project: default - labels: - author: orz - categories: - - data-analysis - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: drift_magnitude - entry_points: - to_observations: - name: to_observations - doc: '' - parameters: - - name: context - default: '' - - name: t - default: '' - - name: u - default: '' - - name: key - default: '' - outputs: - - default: '' - lineno: 16 - tvd: - name: tvd - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 42 - helinger: - name: helinger - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 46 - kl_divergence: - name: kl_divergence - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 50 - all_metrics: - name: all_metrics - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 56 - drift_magnitude: - name: drift_magnitude - doc: "Drift magnitude metrics\n Computes drift magnitude metrics between base\ - \ dataset t and dataset u.\n Metrics:\n - TVD (Total Variation Distance)\n\ - \ - Helinger\n - KL Divergence" - parameters: - - name: context - doc: MLRun context - default: '' - - name: t - type: DataFrame - doc: Base dataset for the drift metrics - default: '' - - name: u - type: DataFrame - doc: Test dataset for the drift 
metrics - default: '' - - name: label_col - doc: Label colum in t and u - default: null - - name: prediction_col - doc: Predictions column in t and u - default: null - - name: discretizers - type: dict - default: null - - name: n_bins - doc: Number of bins to be used for histrogram creation from continuous variables - default: 5 - - name: stream_name - type: str - doc: Output stream to push metrics to - default: some_stream - - name: results_tsdb_container - type: str - doc: TSDB table container to push metrics to - default: bigdata - - name: results_tsdb_table - type: str - doc: TSDB table to push metrics to - default: concept_drift/drift_magnitude - outputs: - - default: '' - lineno: 60 - description: Compute drift magnitude between Time-Samples T and U - build: - functionSourceCode:  - commands: - - python -m pip install scikit-learn scipy v3io_frames - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py - affinity: null -verbose: false diff --git a/functions/development/virtual_drift/0.0.1/src/item.yaml b/functions/development/virtual_drift/0.0.1/src/item.yaml deleted file mode 100644 index 21101b4e..00000000 --- a/functions/development/virtual_drift/0.0.1/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -- machine-learning -description: Compute drift magnitude between Time-Samples T and U -doc: '' -example: virtual_drift.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: '' -name: virtual-drift -platformVersion: '' -spec: - filename: virtual_drift.py - handler: drift_magnitude - image: mlrun/ml-models - kind: job - requirements: - - scikit-learn - - scipy - - v3io_frames -url: '' -version: 0.0.1 diff --git a/functions/development/virtual_drift/0.0.1/src/virtual_drift.ipynb b/functions/development/virtual_drift/0.0.1/src/virtual_drift.ipynb 
deleted file mode 100644 index 45ac6cd0..00000000 --- a/functions/development/virtual_drift/0.0.1/src/virtual_drift.ipynb +++ /dev/null @@ -1,888 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Virtual Drift\n", - "\n", - "Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u. \n", - "\n", - "Metrics:\n", - "- TVD (Total Variation Distance)\n", - "- Helinger\n", - "- KL Divergence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Environment setup" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io, run_local" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "import scipy as sp\n", - "import pickle\n", - "import datetime\n", - "\n", - "import v3io_frames as v3f\n", - "\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.preprocessing import KBinsDiscretizer" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def to_observations(context, t, u, key):\n", - " # Create density\n", - " t = t.apply(lambda row: f\"{'_'.join([str(row[val]) for val in t.columns])}\", axis=1).value_counts().sort_index()\n", - " u = u.apply(lambda row: 
f\"{'_'.join([str(row[val]) for val in u.columns])}\", axis=1).value_counts().sort_index()\n", - "\n", - " # Add 0s if needed\n", - " joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()\n", - " joined_uniques.columns = ['t', 'u']\n", - "\n", - " t_obs = joined_uniques.loc[:, 't']\n", - " u_obs = joined_uniques.loc[:, 'u']\n", - "\n", - " t_pdf = t_obs/t_obs.sum()\n", - " u_pdf = u_obs/u_obs.sum()\n", - "\n", - " context.log_dataset(f'{key}_t_pdf', pd.DataFrame(t_pdf), format='parquet')\n", - " context.log_dataset(f'{key}_u_pdf', pd.DataFrame(u_pdf), format='parquet')\n", - " return t_pdf, u_pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def tvd(t, u):\n", - " return sum(abs(t - u)) / 2\n", - "\n", - "def helinger(t, u):\n", - " return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2))))/np.sqrt(2)\n", - "\n", - "def kl_divergence(t, u):\n", - " t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))\n", - " u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))\n", - " return t_u + u_t\n", - "\n", - "def all_metrics(t, u):\n", - " return tvd(t, u), helinger(t, u), kl_divergence(t, u)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "def drift_magnitude(context, t: pd.DataFrame, u: pd.DataFrame, \n", - " label_col=None, prediction_col=None, \n", - " discretizers: dict = None, n_bins=5,\n", - " stream_name: str = 'some_stream',\n", - " results_tsdb_container: str = 'bigdata',\n", - " results_tsdb_table: str = 'concept_drift/drift_magnitude'):\n", - " \"\"\"Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u.\n", - " Metrics:\n", - " - TVD (Total Variation Distance)\n", - " - Helinger\n", - " - KL Divergence\n", - " \n", - " :param context: MLRun context\n", - " :param t: Base dataset for the drift metrics\n", - " :param u: Test dataset for the drift metrics\n", - " :param 
label_col: Label colum in t and u\n", - " :param prediction_col: Predictions column in t and u\n", - " :param discritizers: Dictionary of dicsritizers for the features if available\n", - " (Created automatically if not provided)\n", - " :param n_bins: Number of bins to be used for histrogram creation from continuous variables\n", - " :param stream_name: Output stream to push metrics to\n", - " :param results_tsdb_container: TSDB table container to push metrics to\n", - " :param results_tsdb_table: TSDB table to push metrics to\n", - " \"\"\"\n", - " \n", - " # Setup v3io connection and TSDB table\n", - " v3io_client = v3f.Client('framesd:8081', container=results_tsdb_container)\n", - " try:\n", - " v3io_client.create('tsdb', results_tsdb_table, if_exists=1, rate='1/s')\n", - " except:\n", - " v3io_client.create('tsdb', results_tsdb_table, if_exists=1, attrs={'rate': '1/s'})\n", - " \n", - " # Get input DFs\n", - " df_t = t.as_df()\n", - " df_u = u.as_df()\n", - " \n", - " # Get feature cols\n", - " \n", - " drop_columns = []\n", - " if label_col is not None:\n", - " drop_columns.append(label_col)\n", - " if prediction_col is not None:\n", - " drop_columns.append(prediction_col)\n", - " \n", - " \n", - " # Discretize continuous featuers\n", - " continuous_features = df_t.select_dtypes(['float'])\n", - " if discretizers is None:\n", - " discretizers = {}\n", - " for feature in continuous_features.columns:\n", - " context.logger.info(f'Fitting discretizer for {feature}')\n", - " # Need to train a new discretizer\n", - " discretizer = KBinsDiscretizer(n_bins=n_bins,\n", - " encode='ordinal',\n", - " strategy='uniform')\n", - "\n", - " discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))\n", - " discretizers[feature] = discretizer\n", - " os.makedirs(context.artifact_path, exist_ok=True)\n", - " discretizers_path = os.path.abspath(f'{context.artifact_path}/discritizer.pkl')\n", - " with open(discretizers_path, 'wb') as f:\n", - " 
pickle.dump(discretizers, f)\n", - " context.log_artifact('discritizers', target_path=discretizers_path)\n", - " context.logger.info('Discretizing featuers')\n", - " for feature, discretizer in discretizers.items():\n", - " df_t[feature] = discretizer.transform(df_t.loc[:, feature].values.reshape(-1, 1))\n", - " df_u[feature] = discretizer.transform(df_u.loc[:, feature].values.reshape(-1, 1))\n", - " df_t[feature] = df_t[feature].astype('int')\n", - " df_u[feature] = df_u[feature].astype('int')\n", - " context.log_dataset('t_discrete', df_t, format='parquet')\n", - " context.log_dataset('u_discrete', df_u, format='parquet')\n", - " \n", - " # Estimate probabilities \n", - " # P(X), P(y), P(X|y), P(y|X) for t and u\n", - " \n", - " context.logger.info('Compute prior metrics')\n", - " \n", - " results = {}\n", - " t_prior, u_prior = to_observations(context, df_t.drop(drop_columns, axis=1), \n", - " df_u.drop(drop_columns, axis=1), 'features')\n", - " results['prior_tvd'], results['prior_helinger'], results['prior_kld'] = all_metrics(t_prior, u_prior)\n", - " \n", - " if prediction_col is not None:\n", - " context.logger.info('Compute prediction metrics')\n", - " t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])\n", - " u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])\n", - " t_class, u_class = to_observations(context, t_predictions,\n", - " u_predictions, 'prediction')\n", - " results['prediction_shift_tvd'], results['prediction_shift_helinger'], results['prediction_shift_kld'] = all_metrics(t_class, u_class)\n", - " \n", - " if label_col is not None:\n", - " context.logger.info('Compute class metrics')\n", - " t_labels = pd.DataFrame(df_t.loc[:, label_col])\n", - " u_labels = pd.DataFrame(df_u.loc[:, label_col])\n", - " t_class, u_class = to_observations(context, t_labels,\n", - " u_labels, 'class')\n", - " results['class_shift_tvd'], results['class_shift_helinger'], results['class_shift_kld'] = all_metrics(t_class, u_class)\n", - " \n", - " for 
key, value in results.items():\n", - " if value == float('inf'):\n", - " context.logger.info(f'value: {value}')\n", - " results[key]=10\n", - " # Log results\n", - " for key, result in results.items():\n", - " context.log_result(key, round(result, 3))\n", - " \n", - " # Push results to TSDB\n", - " now = pd.to_datetime(str(datetime.datetime.now()))\n", - " now\n", - " \n", - " results['timestamp'] = pd.to_datetime(str((datetime.datetime.now())))\n", - " context.logger.info(f\"Timestamp: {results['timestamp']}\")\n", - " results['stream'] = stream_name\n", - " results_df = pd.DataFrame(data=[list(results.values())],\n", - " columns=list(results.keys()))\n", - " results_df = results_df.set_index(['timestamp', 'stream'])\n", - " v3io_client.write('tsdb', results_tsdb_table, dfs=results_df)\n", - "# context.log_dataset('results', results_df, format='pq')" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import random" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Wine dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_wine" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "wine = load_wine()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", - "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", - "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", - "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", - "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", - "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", - "\n", - " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", - "0 3.06 0.28 2.29 5.64 1.04 \n", - "1 2.76 0.26 1.28 4.38 1.05 \n", - "2 3.24 0.30 2.81 5.68 1.03 \n", - "3 3.49 0.24 2.18 7.80 0.86 \n", - "4 2.69 0.39 1.82 4.32 1.04 \n", - "\n", - " od280/od315_of_diluted_wines proline y prediction \n", - "0 3.92 1065.0 0 0 \n", - "1 3.40 1050.0 0 0 \n", - "2 3.17 1185.0 0 0 \n", - "3 3.45 1480.0 0 0 \n", - "4 2.93 735.0 0 0 " - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_wine = pd.DataFrame(data=wine['data'],\n", - " columns=wine['feature_names'])\n", - "df_wine['y'] = wine['target']\n", - "df_wine['prediction'] = wine['target']\n", - "df_wine.to_parquet('data/wine_t.pq')\n", - "df_wine.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "u = df_wine.sample(frac=0.5).copy()\n", - "# change_feature = [random.choice(wine['feature_names']), random.choice(wine['feature_names'])]\n", - "# u[change_feature] = 1\n", - "u.to_parquet('data/wine_u.pq')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:58:50,586 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"virtual_drift\", \n", - " kind='job', \n", - " image='mlrun/ml-models')\n", - "\n", - "# add 
metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"drift_magnitude\"\n", - "fn.spec.description = \"Compute drift magnitude between Time-Samples T and U\"\n", - "fn.metadata.categories = [\"ml\", \"serve\", \"concept-drift\"]\n", - "fn.metadata.labels = {\"author\": \"orz\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io())" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name='drift_magnitude',\n", - " handler='drift_magnitude',\n", - " params={'label_col': 'y',\n", - " 'results_tsdb_container': 'bigdata',\n", - " 'results_tsdb_table': 'drift_magnitude'},\n", - " inputs={'t': '/User/functions/virtual_drift/data/wine_t.pq',\n", - " 'u': '/User/functions/virtual_drift/data/wine_u.pq'},\n", - " artifact_path=os.path.abspath('/User/functions/virtual_drift/artifacts'))" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-02 12:56:25,352 starting run drift_magnitude uid=a20c78ddc72e45119ac4684bc4b32876 -> http://10.192.65.32:8080\n", - "[mlrun] 2020-06-02 12:56:26,121 Job is running in the background, pod: drift-magnitude-xqb5r\n", - "[mlrun] 2020-06-02 12:56:44,171 starting local run: main.py # drift_magnitude\n", - "[mlrun] 2020-06-02 12:56:48,652 Fitting discretizer for alcohol\n", - "[mlrun] 2020-06-02 12:56:48,655 Fitting discretizer for malic_acid\n", - "[mlrun] 2020-06-02 12:56:48,657 Fitting discretizer for ash\n", - "[mlrun] 2020-06-02 12:56:48,658 Fitting 
discretizer for alcalinity_of_ash\n", - "[mlrun] 2020-06-02 12:56:48,660 Fitting discretizer for magnesium\n", - "[mlrun] 2020-06-02 12:56:48,662 Fitting discretizer for total_phenols\n", - "[mlrun] 2020-06-02 12:56:48,663 Fitting discretizer for flavanoids\n", - "[mlrun] 2020-06-02 12:56:48,664 Fitting discretizer for nonflavanoid_phenols\n", - "[mlrun] 2020-06-02 12:56:48,666 Fitting discretizer for proanthocyanins\n", - "[mlrun] 2020-06-02 12:56:48,668 Fitting discretizer for color_intensity\n", - "[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for hue\n", - "[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for od280/od315_of_diluted_wines\n", - "[mlrun] 2020-06-02 12:56:48,672 Fitting discretizer for proline\n", - "[mlrun] 2020-06-02 12:56:48,752 log artifact discritizers at /User/demo-network-operations/artifacts/discritizer.pkl, size: None, db: Y\n", - "[mlrun] 2020-06-02 12:56:48,754 Discretizing featuers\n", - "[mlrun] 2020-06-02 12:56:49,199 log artifact t_discrete at /User/demo-network-operations/artifacts/t_discrete.parquet, size: 11803, db: Y\n", - "[mlrun] 2020-06-02 12:56:49,549 log artifact u_discrete at /User/demo-network-operations/artifacts/u_discrete.parquet, size: 12370, db: Y\n", - "[mlrun] 2020-06-02 12:56:49,552 Compute prior metrics\n", - "[mlrun] 2020-06-02 12:56:49,968 log artifact features_t_pdf at /User/demo-network-operations/artifacts/features_t_pdf.parquet, size: 4399, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,103 log artifact features_u_pdf at /User/demo-network-operations/artifacts/features_u_pdf.parquet, size: 4428, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,123 Compute class metrics\n", - "[mlrun] 2020-06-02 12:56:50,292 log artifact class_t_pdf at /User/demo-network-operations/artifacts/class_t_pdf.parquet, size: 2162, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,408 log artifact class_u_pdf at /User/demo-network-operations/artifacts/class_u_pdf.parquet, size: 2162, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,424 value: inf\n", - 
"[mlrun] 2020-06-02 12:56:50,459 Timestamp: 2020-06-02 12:56:50.458032\n", - "\n", - "[mlrun] 2020-06-02 12:56:50,834 run executed, status=completed\n", - "/usr/local/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log\n", - " result = getattr(ufunc, method)(*inputs, **kwargs)\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 02 12:56:48completeddrift_magnitude
v3io_user=admin
kind=job
owner=admin
host=drift-magnitude-xqb5r
t
u
label_col=y
results_tsdb_container=bigdata
results_tsdb_table=drift_magnitude
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.028
class_shift_helinger=0.02
class_shift_kld=0.003
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run a20c78ddc72e45119ac4684bc4b32876 , !mlrun logs a20c78ddc72e45119ac4684bc4b32876 \n", - "[mlrun] 2020-06-02 12:56:57,590 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.with_code().run(task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/virtual_drift/0.0.1/src/virtual_drift.py b/functions/development/virtual_drift/0.0.1/src/virtual_drift.py deleted file mode 100644 index e677d721..00000000 --- a/functions/development/virtual_drift/0.0.1/src/virtual_drift.py +++ /dev/null @@ -1,192 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import scipy as sp -import pickle -import datetime - -import v3io_frames as v3f - -import matplotlib.pyplot as plt -from sklearn.preprocessing import KBinsDiscretizer - - -def to_observations(context, t, u, key): - t = ( - t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() 
- ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf - - -def tvd(t, u): - return sum(abs(t - u)) / 2 - - -def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2) - - -def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t - - -def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u) - - -def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", - results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. 
- Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = 
discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - 
data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df) diff --git a/functions/development/virtual_drift/0.0.1/static/documentation.html b/functions/development/virtual_drift/0.0.1/static/documentation.html deleted file mode 100644 index 37387134..00000000 --- a/functions/development/virtual_drift/0.0.1/static/documentation.html +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - -virtual_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

virtual_drift package

-
-

Submodules

-
-
-

virtual_drift.virtual_drift module

-
-
-virtual_drift.virtual_drift.all_metrics(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.drift_magnitude(context, t: pandas.core.frame.DataFrame, u: pandas.core.frame.DataFrame, label_col=None, prediction_col=None, discretizers: Optional[dict] = None, n_bins=5, stream_name: str = 'some_stream', results_tsdb_container: str = 'bigdata', results_tsdb_table: str = 'concept_drift/drift_magnitude')[source]
-
-
Drift magnitude metrics

Computes drift magnitude metrics between base dataset t and dataset u. -Metrics:

-
-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • t – Base dataset for the drift metrics

  • -
  • u – Test dataset for the drift metrics

  • -
  • label_col – Label colum in t and u

  • -
  • prediction_col – Predictions column in t and u

  • -
  • discritizers – Dictionary of dicsritizers for the features if available -(Created automatically if not provided)

  • -
  • n_bins – Number of bins to be used for histrogram creation from continuous variables

  • -
  • stream_name – Output stream to push metrics to

  • -
  • results_tsdb_container – TSDB table container to push metrics to

  • -
  • results_tsdb_table – TSDB table to push metrics to

  • -
-
-
-
-
-
-virtual_drift.virtual_drift.helinger(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.kl_divergence(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.to_observations(context, t, u, key)[source]
-
-
-
-virtual_drift.virtual_drift.tvd(t, u)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.0.1/static/example.html b/functions/development/virtual_drift/0.0.1/static/example.html deleted file mode 100644 index 97caa4f9..00000000 --- a/functions/development/virtual_drift/0.0.1/static/example.html +++ /dev/null @@ -1,851 +0,0 @@ - - - - - - - -Virtual Drift - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Virtual Drift

-

Drift magnitude metrics -Computes drift magnitude metrics between base dataset t and dataset u.

-

Metrics:

-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-

Environment setup

-
-
-
%load_ext autoreload
-%autoreload 2
-
-
-
-
-
-
-
import nuclio
-
-
-
-
-
-
-
from mlrun import code_to_function, mount_v3io, run_local
-
-
-
-
-
-
-

Function

-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-
-
-
-
-
def to_observations(context, t, u, key):
-    # Create density
-    t = t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1).value_counts().sort_index()
-    u = u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1).value_counts().sort_index()
-
-    # Add 0s if needed
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ['t', 'u']
-
-    t_obs = joined_uniques.loc[:, 't']
-    u_obs = joined_uniques.loc[:, 'u']
-
-    t_pdf = t_obs/t_obs.sum()
-    u_pdf = u_obs/u_obs.sum()
-
-    context.log_dataset(f'{key}_t_pdf', pd.DataFrame(t_pdf), format='parquet')
-    context.log_dataset(f'{key}_u_pdf', pd.DataFrame(u_pdf), format='parquet')
-    return t_pdf, u_pdf
-
-
-
-
-
-
-
def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2))))/np.sqrt(2)
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-
-
-
-
-
def drift_magnitude(context, t: pd.DataFrame, u: pd.DataFrame, 
-         label_col=None, prediction_col=None, 
-         discretizers: dict = None, n_bins=5,
-         stream_name: str = 'some_stream',
-         results_tsdb_container: str = 'bigdata',
-         results_tsdb_table: str = 'concept_drift/drift_magnitude'):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-        
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-    
-    # Setup v3io connection and TSDB table
-    v3io_client = v3f.Client('framesd:8081', container=results_tsdb_container)
-    try:
-        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, rate='1/s')
-    except:
-        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, attrs={'rate': '1/s'})
-    
-    # Get input DFs
-    df_t = t.as_df()
-    df_u = u.as_df()
-    
-    # Get feature cols
-    
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-    
-    
-    # Discretize continuous featuers
-    continuous_features = df_t.select_dtypes(['float'])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f'Fitting discretizer for {feature}')
-            # Need to train a new discretizer
-            discretizer = KBinsDiscretizer(n_bins=n_bins,
-                                           encode='ordinal',
-                                           strategy='uniform')
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f'{context.artifact_path}/discritizer.pkl')
-    with open(discretizers_path, 'wb') as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact('discritizers', target_path=discretizers_path)
-    context.logger.info('Discretizing featuers')
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(df_t.loc[:, feature].values.reshape(-1, 1))
-        df_u[feature] = discretizer.transform(df_u.loc[:, feature].values.reshape(-1, 1))
-        df_t[feature] = df_t[feature].astype('int')
-        df_u[feature] = df_u[feature].astype('int')
-    context.log_dataset('t_discrete', df_t, format='parquet')
-    context.log_dataset('u_discrete', df_u, format='parquet')
-    
-    # Estimate probabilities 
-    # P(X), P(y), P(X|y), P(y|X) for t and u
-    
-    context.logger.info('Compute prior metrics')
-    
-    results = {}
-    t_prior, u_prior = to_observations(context, df_t.drop(drop_columns, axis=1), 
-                                       df_u.drop(drop_columns, axis=1), 'features')
-    results['prior_tvd'], results['prior_helinger'], results['prior_kld'] = all_metrics(t_prior, u_prior)
-    
-    if prediction_col is not None:
-        context.logger.info('Compute prediction metrics')
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(context, t_predictions,
-                                           u_predictions, 'prediction')
-        results['prediction_shift_tvd'], results['prediction_shift_helinger'], results['prediction_shift_kld'] = all_metrics(t_class, u_class)
-        
-    if label_col is not None:
-        context.logger.info('Compute class metrics')
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels,
-                                           u_labels, 'class')
-        results['class_shift_tvd'], results['class_shift_helinger'], results['class_shift_kld'] = all_metrics(t_class, u_class)
-    
-    for key, value in results.items():
-        if value == float('inf'):
-            context.logger.info(f'value: {value}')
-            results[key]=10
-    # Log results
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-        
-    # Push results to TSDB
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-    
-    results['timestamp'] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results['stream'] = stream_name
-    results_df = pd.DataFrame(data=[list(results.values())],
-                              columns=list(results.keys()))
-    results_df = results_df.set_index(['timestamp', 'stream'])
-    v3io_client.write('tsdb', results_tsdb_table, dfs=results_df)
-#     context.log_dataset('results', results_df, format='pq')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test

-
-
-
import random
-
-
-
-
-
-

Wine dataset

-
-
-
from sklearn.datasets import load_wine
-
-
-
-
-
-
-
from mlrun import NewTask
-
-
-
-
-
-
-
wine = load_wine()
-
-
-
-
-
-
-
df_wine = pd.DataFrame(data=wine['data'],
-                       columns=wine['feature_names'])
-df_wine['y'] = wine['target']
-df_wine['prediction'] = wine['target']
-df_wine.to_parquet('data/wine_t.pq')
-df_wine.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
-
-
-
-
-
u = df_wine.sample(frac=0.5).copy()
-# change_feature = [random.choice(wine['feature_names']), random.choice(wine['feature_names'])]
-# u[change_feature] = 1
-u.to_parquet('data/wine_u.pq')
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("virtual_drift", 
-                      kind='job', 
-                      image='mlrun/ml-models')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "drift_magnitude"
-fn.spec.description = "Compute drift magnitude between Time-Samples T and U"
-fn.metadata.categories = ["ml", "serve", "concept-drift"]
-fn.metadata.labels = {"author": "orz"}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:58:50,586 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f23c8ab3f28>
-
-
-
-
-
-
-
fn.deploy()
-
-
-
-
-
-
-
fn.apply(mount_v3io())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9b09fb0e80>
-
-
-
-
-
-
-
task = NewTask(name='drift_magnitude',
-               handler='drift_magnitude',
-               params={'label_col': 'y',
-                       'results_tsdb_container': 'bigdata',
-                       'results_tsdb_table': 'drift_magnitude'},
-               inputs={'t': '/User/functions/virtual_drift/data/wine_t.pq',
-                       'u': '/User/functions/virtual_drift/data/wine_u.pq'},
-               artifact_path=os.path.abspath('/User/functions/virtual_drift/artifacts'))
-
-
-
-
-
-
-
fn.with_code().run(task)
-
-
-
-
-
[mlrun] 2020-06-02 12:56:25,352 starting run drift_magnitude uid=a20c78ddc72e45119ac4684bc4b32876  -> http://10.192.65.32:8080
-[mlrun] 2020-06-02 12:56:26,121 Job is running in the background, pod: drift-magnitude-xqb5r
-[mlrun] 2020-06-02 12:56:44,171 starting local run: main.py # drift_magnitude
-[mlrun] 2020-06-02 12:56:48,652 Fitting discretizer for alcohol
-[mlrun] 2020-06-02 12:56:48,655 Fitting discretizer for malic_acid
-[mlrun] 2020-06-02 12:56:48,657 Fitting discretizer for ash
-[mlrun] 2020-06-02 12:56:48,658 Fitting discretizer for alcalinity_of_ash
-[mlrun] 2020-06-02 12:56:48,660 Fitting discretizer for magnesium
-[mlrun] 2020-06-02 12:56:48,662 Fitting discretizer for total_phenols
-[mlrun] 2020-06-02 12:56:48,663 Fitting discretizer for flavanoids
-[mlrun] 2020-06-02 12:56:48,664 Fitting discretizer for nonflavanoid_phenols
-[mlrun] 2020-06-02 12:56:48,666 Fitting discretizer for proanthocyanins
-[mlrun] 2020-06-02 12:56:48,668 Fitting discretizer for color_intensity
-[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for hue
-[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for od280/od315_of_diluted_wines
-[mlrun] 2020-06-02 12:56:48,672 Fitting discretizer for proline
-[mlrun] 2020-06-02 12:56:48,752 log artifact discritizers at /User/demo-network-operations/artifacts/discritizer.pkl, size: None, db: Y
-[mlrun] 2020-06-02 12:56:48,754 Discretizing featuers
-[mlrun] 2020-06-02 12:56:49,199 log artifact t_discrete at /User/demo-network-operations/artifacts/t_discrete.parquet, size: 11803, db: Y
-[mlrun] 2020-06-02 12:56:49,549 log artifact u_discrete at /User/demo-network-operations/artifacts/u_discrete.parquet, size: 12370, db: Y
-[mlrun] 2020-06-02 12:56:49,552 Compute prior metrics
-[mlrun] 2020-06-02 12:56:49,968 log artifact features_t_pdf at /User/demo-network-operations/artifacts/features_t_pdf.parquet, size: 4399, db: Y
-[mlrun] 2020-06-02 12:56:50,103 log artifact features_u_pdf at /User/demo-network-operations/artifacts/features_u_pdf.parquet, size: 4428, db: Y
-[mlrun] 2020-06-02 12:56:50,123 Compute class metrics
-[mlrun] 2020-06-02 12:56:50,292 log artifact class_t_pdf at /User/demo-network-operations/artifacts/class_t_pdf.parquet, size: 2162, db: Y
-[mlrun] 2020-06-02 12:56:50,408 log artifact class_u_pdf at /User/demo-network-operations/artifacts/class_u_pdf.parquet, size: 2162, db: Y
-[mlrun] 2020-06-02 12:56:50,424 value: inf
-[mlrun] 2020-06-02 12:56:50,459 Timestamp: 2020-06-02 12:56:50.458032
-
-[mlrun] 2020-06-02 12:56:50,834 run executed, status=completed
-/usr/local/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log
-  result = getattr(ufunc, method)(*inputs, **kwargs)
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 02 12:56:48completeddrift_magnitude
v3io_user=admin
kind=job
owner=admin
host=drift-magnitude-xqb5r
t
u
label_col=y
results_tsdb_container=bigdata
results_tsdb_table=drift_magnitude
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.028
class_shift_helinger=0.02
class_shift_kld=0.003
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run a20c78ddc72e45119ac4684bc4b32876  , !mlrun logs a20c78ddc72e45119ac4684bc4b32876 
-[mlrun] 2020-06-02 12:56:57,590 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f9b0d967128>
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.0.1/static/function.html b/functions/development/virtual_drift/0.0.1/static/function.html deleted file mode 100644 index 69d956a8..00000000 --- a/functions/development/virtual_drift/0.0.1/static/function.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: virtual-drift
-  tag: ''
-  hash: 8990fdd72fc550189a0c8b488b69997428b786c9
-  project: default
-  labels:
-    author: orz
-  categories:
-  - data-analysis
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: drift_magnitude
-  entry_points:
-    to_observations:
-      name: to_observations
-      doc: ''
-      parameters:
-      - name: context
-        default: ''
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      - name: key
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 16
-    tvd:
-      name: tvd
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 42
-    helinger:
-      name: helinger
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 46
-    kl_divergence:
-      name: kl_divergence
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 50
-    all_metrics:
-      name: all_metrics
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 56
-    drift_magnitude:
-      name: drift_magnitude
-      doc: "Drift magnitude metrics\n   Computes drift magnitude metrics between base\
-        \ dataset t and dataset u.\n   Metrics:\n    - TVD (Total Variation Distance)\n\
-        \    - Helinger\n    - KL Divergence"
-      parameters:
-      - name: context
-        doc: MLRun context
-        default: ''
-      - name: t
-        type: DataFrame
-        doc: Base dataset for the drift metrics
-        default: ''
-      - name: u
-        type: DataFrame
-        doc: Test dataset for the drift metrics
-        default: ''
-      - name: label_col
-        doc: Label colum in t and u
-        default: null
-      - name: prediction_col
-        doc: Predictions column in t and u
-        default: null
-      - name: discretizers
-        type: dict
-        default: null
-      - name: n_bins
-        doc: Number of bins to be used for histrogram creation from continuous variables
-        default: 5
-      - name: stream_name
-        type: str
-        doc: Output stream to push metrics to
-        default: some_stream
-      - name: results_tsdb_container
-        type: str
-        doc: TSDB table container to push metrics to
-        default: bigdata
-      - name: results_tsdb_table
-        type: str
-        doc: TSDB table to push metrics to
-        default: concept_drift/drift_magnitude
-      outputs:
-      - default: ''
-      lineno: 60
-  description: Compute drift magnitude between Time-Samples T and U
-  build:
-    functionSourceCode: 
-    commands:
-    - python -m pip install scikit-learn scipy v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.0.1/static/item.html b/functions/development/virtual_drift/0.0.1/static/item.html deleted file mode 100644 index 939fcfe1..00000000 --- a/functions/development/virtual_drift/0.0.1/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-- machine-learning
-description: Compute drift magnitude between Time-Samples T and U
-doc: ''
-example: virtual_drift.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: ''
-name: virtual-drift
-platformVersion: ''
-spec:
-  filename: virtual_drift.py
-  handler: drift_magnitude
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-learn
-  - scipy
-  - v3io_frames
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.0.1/static/source.html b/functions/development/virtual_drift/0.0.1/static/source.html deleted file mode 100644 index 802cf249..00000000 --- a/functions/development/virtual_drift/0.0.1/static/source.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-def to_observations(context, t, u, key):
-    t = (
-        t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-    u = (
-        u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ["t", "u"]
-
-    t_obs = joined_uniques.loc[:, "t"]
-    u_obs = joined_uniques.loc[:, "u"]
-
-    t_pdf = t_obs / t_obs.sum()
-    u_pdf = u_obs / u_obs.sum()
-
-    context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet")
-    context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet")
-    return t_pdf, u_pdf
-
-
-def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
-
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-def drift_magnitude(
-    context,
-    t: pd.DataFrame,
-    u: pd.DataFrame,
-    label_col=None,
-    prediction_col=None,
-    discretizers: dict = None,
-    n_bins=5,
-    stream_name: str = "some_stream",
-    results_tsdb_container: str = "bigdata",
-    results_tsdb_table: str = "concept_drift/drift_magnitude",
-):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-
-    v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container)
-    try:
-        v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s")
-    except:
-        v3io_client.create(
-            "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"}
-        )
-
-    df_t = t.as_df()
-    df_u = u.as_df()
-
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-
-    continuous_features = df_t.select_dtypes(["float"])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f"Fitting discretizer for {feature}")
-            discretizer = KBinsDiscretizer(
-                n_bins=n_bins, encode="ordinal", strategy="uniform"
-            )
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl")
-    with open(discretizers_path, "wb") as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact("discritizers", target_path=discretizers_path)
-    context.logger.info("Discretizing featuers")
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(
-            df_t.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_u[feature] = discretizer.transform(
-            df_u.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_t[feature] = df_t[feature].astype("int")
-        df_u[feature] = df_u[feature].astype("int")
-    context.log_dataset("t_discrete", df_t, format="parquet")
-    context.log_dataset("u_discrete", df_u, format="parquet")
-
-    context.logger.info("Compute prior metrics")
-
-    results = {}
-    t_prior, u_prior = to_observations(
-        context,
-        df_t.drop(drop_columns, axis=1),
-        df_u.drop(drop_columns, axis=1),
-        "features",
-    )
-    results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics(
-        t_prior, u_prior
-    )
-
-    if prediction_col is not None:
-        context.logger.info("Compute prediction metrics")
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(
-            context, t_predictions, u_predictions, "prediction"
-        )
-        (
-            results["prediction_shift_tvd"],
-            results["prediction_shift_helinger"],
-            results["prediction_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    if label_col is not None:
-        context.logger.info("Compute class metrics")
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels, u_labels, "class")
-        (
-            results["class_shift_tvd"],
-            results["class_shift_helinger"],
-            results["class_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    for key, value in results.items():
-        if value == float("inf"):
-            context.logger.info(f"value: {value}")
-            results[key] = 10
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-
-    results["timestamp"] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results["stream"] = stream_name
-    results_df = pd.DataFrame(
-        data=[list(results.values())], columns=list(results.keys())
-    )
-    results_df = results_df.set_index(["timestamp", "stream"])
-    v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.8.0/src/README.md b/functions/development/virtual_drift/0.8.0/src/README.md deleted file mode 100644 index cd738390..00000000 --- a/functions/development/virtual_drift/0.8.0/src/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Drift Magnitude - -Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We can use the following Drift Magnitude metrics to map and understand our concepts and how close the properties of the data we used to train the models on are to the current data we receive. - -## How to integrate - -The Virtual Drift function is built to receive two data batches of data (as `dataitem` or `Dataframe`), base batch *t* and current batch *u*. - -```markdown -:param context: MLRun context -:param t: Base dataset for the drift metrics -:param u: Test dataset for the drift metrics -:param label_col: Label colum in t and u -:param prediction_col: Predictions column in t and u -:param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) -:param n_bins: Number of bins to be used for histrogram creation from continuous variables -:param stream_name: Output stream to push metrics to -:param results_tsdb_container: TSDB table container to push metrics to -:param results_tsdb_table: TSDB table to push metrics to -``` - -The function will calculate the selected drift mangitude metrics that were selected and apply them to the **features**, **labels** and **predictions**. It will then save those metrics and export them via Parquet and TSDB. Alerting could be added on top of the metrics via Grafana or a function. 
- -## Metrics - -The drift magnitude metrics we calculate are: - -### TVD - Total Variation Distance - -Provides a symetric drift distance between two periods *t* and *u* -Z - vector of random variables -P*t* - Probability distribution over timespan *t* - -![\sigma_{t, u}(Z)=\frac{1}{2}\sum_{\hat{z}\in{dom(Z)}}{|P_t{(\hat{Z})-P_u{(\hat{Z})}}|}]() - -### Helinger Distance - -Hellinger distance is an *f* divergence measuer, similar to the Kullback-Leibler (KL) divergence. However, unlike KL Divergence the Hellinger divergence is symmetric and bounded over a probability space. - -P, Q - Discrete probability distributions (P*i*, ..., P*k*). - -![H(P,Q)=\frac{1}{\sqrt{2}}\sqrt{\sum_{i=1}^{k}{(\sqrt{p_i}-\sqrt{q_i})^2}}]() - - -### KL Divergence - -KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another. It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality. KL Divergence of 0, indicates two identical distributrions. - -![D_{KL}(P||Q)=\sum_{x\in{X}}{(P(x)\log{\frac{P(x)}{Q(x)}})}]() - -## Additional Resources - -Webb, Geoffrey I. et al. “[Characterizing Concept Drift.](https://arxiv.org/abs/1511.03816)” Data Mining and Knowledge Discovery 30.4 (2016): 964–994. Crossref. Web. 
- -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/virtual_drift/0.8.0/src/function.yaml b/functions/development/virtual_drift/0.8.0/src/function.yaml deleted file mode 100644 index 1e9086e4..00000000 --- a/functions/development/virtual_drift/0.8.0/src/function.yaml +++ /dev/null @@ -1,129 +0,0 @@ -kind: job -metadata: - name: virtual-drift - tag: '' - hash: 8990fdd72fc550189a0c8b488b69997428b786c9 - project: default - labels: - author: orz - categories: - - data-analysis - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: drift_magnitude - entry_points: - to_observations: - name: to_observations - doc: '' - parameters: - - name: context - default: '' - - name: t - default: '' - - name: u - default: '' - - name: key - default: '' - outputs: - - default: '' - lineno: 16 - tvd: - name: tvd - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 42 - helinger: - name: helinger - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 46 - kl_divergence: - name: kl_divergence - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 50 - all_metrics: - name: all_metrics - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 56 - drift_magnitude: - name: drift_magnitude - doc: "Drift magnitude metrics\n Computes drift magnitude metrics between base\ - \ dataset t and dataset u.\n Metrics:\n - TVD (Total Variation Distance)\n\ - \ - Helinger\n - KL Divergence" - parameters: - - name: context - doc: MLRun context - default: '' - - name: t - type: DataFrame - doc: Base dataset for the drift metrics - default: '' - - name: u - type: DataFrame - doc: Test dataset for the drift 
metrics - default: '' - - name: label_col - doc: Label colum in t and u - default: null - - name: prediction_col - doc: Predictions column in t and u - default: null - - name: discretizers - type: dict - default: null - - name: n_bins - doc: Number of bins to be used for histrogram creation from continuous variables - default: 5 - - name: stream_name - type: str - doc: Output stream to push metrics to - default: some_stream - - name: results_tsdb_container - type: str - doc: TSDB table container to push metrics to - default: bigdata - - name: results_tsdb_table - type: str - doc: TSDB table to push metrics to - default: concept_drift/drift_magnitude - outputs: - - default: '' - lineno: 60 - description: Compute drift magnitude between Time-Samples T and U - build: - functionSourceCode:  - commands: - - python -m pip install scikit-learn scipy v3io_frames - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py - affinity: null -verbose: false diff --git a/functions/development/virtual_drift/0.8.0/src/item.yaml b/functions/development/virtual_drift/0.8.0/src/item.yaml deleted file mode 100644 index 32609e07..00000000 --- a/functions/development/virtual_drift/0.8.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -- machine-learning -description: Compute drift magnitude between Time-Samples T and U -doc: '' -example: virtual_drift.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: virtual-drift -platformVersion: 3.2.0 -spec: - filename: virtual_drift.py - handler: drift_magnitude - image: mlrun/ml-models - kind: job - requirements: - - scikit-learn - - scipy - - v3io_frames -url: '' -version: 0.8.0 diff --git a/functions/development/virtual_drift/0.8.0/src/virtual_drift.ipynb b/functions/development/virtual_drift/0.8.0/src/virtual_drift.ipynb 
deleted file mode 100644 index 45ac6cd0..00000000 --- a/functions/development/virtual_drift/0.8.0/src/virtual_drift.ipynb +++ /dev/null @@ -1,888 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Virtual Drift\n", - "\n", - "Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u. \n", - "\n", - "Metrics:\n", - "- TVD (Total Variation Distance)\n", - "- Helinger\n", - "- KL Divergence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Environment setup" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import nuclio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import code_to_function, mount_v3io, run_local" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: start-code" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "import scipy as sp\n", - "import pickle\n", - "import datetime\n", - "\n", - "import v3io_frames as v3f\n", - "\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.preprocessing import KBinsDiscretizer" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def to_observations(context, t, u, key):\n", - " # Create density\n", - " t = t.apply(lambda row: f\"{'_'.join([str(row[val]) for val in t.columns])}\", axis=1).value_counts().sort_index()\n", - " u = u.apply(lambda row: 
f\"{'_'.join([str(row[val]) for val in u.columns])}\", axis=1).value_counts().sort_index()\n", - "\n", - " # Add 0s if needed\n", - " joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()\n", - " joined_uniques.columns = ['t', 'u']\n", - "\n", - " t_obs = joined_uniques.loc[:, 't']\n", - " u_obs = joined_uniques.loc[:, 'u']\n", - "\n", - " t_pdf = t_obs/t_obs.sum()\n", - " u_pdf = u_obs/u_obs.sum()\n", - "\n", - " context.log_dataset(f'{key}_t_pdf', pd.DataFrame(t_pdf), format='parquet')\n", - " context.log_dataset(f'{key}_u_pdf', pd.DataFrame(u_pdf), format='parquet')\n", - " return t_pdf, u_pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def tvd(t, u):\n", - " return sum(abs(t - u)) / 2\n", - "\n", - "def helinger(t, u):\n", - " return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2))))/np.sqrt(2)\n", - "\n", - "def kl_divergence(t, u):\n", - " t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))\n", - " u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))\n", - " return t_u + u_t\n", - "\n", - "def all_metrics(t, u):\n", - " return tvd(t, u), helinger(t, u), kl_divergence(t, u)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "def drift_magnitude(context, t: pd.DataFrame, u: pd.DataFrame, \n", - " label_col=None, prediction_col=None, \n", - " discretizers: dict = None, n_bins=5,\n", - " stream_name: str = 'some_stream',\n", - " results_tsdb_container: str = 'bigdata',\n", - " results_tsdb_table: str = 'concept_drift/drift_magnitude'):\n", - " \"\"\"Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u.\n", - " Metrics:\n", - " - TVD (Total Variation Distance)\n", - " - Helinger\n", - " - KL Divergence\n", - " \n", - " :param context: MLRun context\n", - " :param t: Base dataset for the drift metrics\n", - " :param u: Test dataset for the drift metrics\n", - " :param 
label_col: Label colum in t and u\n", - " :param prediction_col: Predictions column in t and u\n", - " :param discritizers: Dictionary of dicsritizers for the features if available\n", - " (Created automatically if not provided)\n", - " :param n_bins: Number of bins to be used for histrogram creation from continuous variables\n", - " :param stream_name: Output stream to push metrics to\n", - " :param results_tsdb_container: TSDB table container to push metrics to\n", - " :param results_tsdb_table: TSDB table to push metrics to\n", - " \"\"\"\n", - " \n", - " # Setup v3io connection and TSDB table\n", - " v3io_client = v3f.Client('framesd:8081', container=results_tsdb_container)\n", - " try:\n", - " v3io_client.create('tsdb', results_tsdb_table, if_exists=1, rate='1/s')\n", - " except:\n", - " v3io_client.create('tsdb', results_tsdb_table, if_exists=1, attrs={'rate': '1/s'})\n", - " \n", - " # Get input DFs\n", - " df_t = t.as_df()\n", - " df_u = u.as_df()\n", - " \n", - " # Get feature cols\n", - " \n", - " drop_columns = []\n", - " if label_col is not None:\n", - " drop_columns.append(label_col)\n", - " if prediction_col is not None:\n", - " drop_columns.append(prediction_col)\n", - " \n", - " \n", - " # Discretize continuous featuers\n", - " continuous_features = df_t.select_dtypes(['float'])\n", - " if discretizers is None:\n", - " discretizers = {}\n", - " for feature in continuous_features.columns:\n", - " context.logger.info(f'Fitting discretizer for {feature}')\n", - " # Need to train a new discretizer\n", - " discretizer = KBinsDiscretizer(n_bins=n_bins,\n", - " encode='ordinal',\n", - " strategy='uniform')\n", - "\n", - " discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))\n", - " discretizers[feature] = discretizer\n", - " os.makedirs(context.artifact_path, exist_ok=True)\n", - " discretizers_path = os.path.abspath(f'{context.artifact_path}/discritizer.pkl')\n", - " with open(discretizers_path, 'wb') as f:\n", - " 
pickle.dump(discretizers, f)\n", - " context.log_artifact('discritizers', target_path=discretizers_path)\n", - " context.logger.info('Discretizing featuers')\n", - " for feature, discretizer in discretizers.items():\n", - " df_t[feature] = discretizer.transform(df_t.loc[:, feature].values.reshape(-1, 1))\n", - " df_u[feature] = discretizer.transform(df_u.loc[:, feature].values.reshape(-1, 1))\n", - " df_t[feature] = df_t[feature].astype('int')\n", - " df_u[feature] = df_u[feature].astype('int')\n", - " context.log_dataset('t_discrete', df_t, format='parquet')\n", - " context.log_dataset('u_discrete', df_u, format='parquet')\n", - " \n", - " # Estimate probabilities \n", - " # P(X), P(y), P(X|y), P(y|X) for t and u\n", - " \n", - " context.logger.info('Compute prior metrics')\n", - " \n", - " results = {}\n", - " t_prior, u_prior = to_observations(context, df_t.drop(drop_columns, axis=1), \n", - " df_u.drop(drop_columns, axis=1), 'features')\n", - " results['prior_tvd'], results['prior_helinger'], results['prior_kld'] = all_metrics(t_prior, u_prior)\n", - " \n", - " if prediction_col is not None:\n", - " context.logger.info('Compute prediction metrics')\n", - " t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])\n", - " u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])\n", - " t_class, u_class = to_observations(context, t_predictions,\n", - " u_predictions, 'prediction')\n", - " results['prediction_shift_tvd'], results['prediction_shift_helinger'], results['prediction_shift_kld'] = all_metrics(t_class, u_class)\n", - " \n", - " if label_col is not None:\n", - " context.logger.info('Compute class metrics')\n", - " t_labels = pd.DataFrame(df_t.loc[:, label_col])\n", - " u_labels = pd.DataFrame(df_u.loc[:, label_col])\n", - " t_class, u_class = to_observations(context, t_labels,\n", - " u_labels, 'class')\n", - " results['class_shift_tvd'], results['class_shift_helinger'], results['class_shift_kld'] = all_metrics(t_class, u_class)\n", - " \n", - " for 
key, value in results.items():\n", - " if value == float('inf'):\n", - " context.logger.info(f'value: {value}')\n", - " results[key]=10\n", - " # Log results\n", - " for key, result in results.items():\n", - " context.log_result(key, round(result, 3))\n", - " \n", - " # Push results to TSDB\n", - " now = pd.to_datetime(str(datetime.datetime.now()))\n", - " now\n", - " \n", - " results['timestamp'] = pd.to_datetime(str((datetime.datetime.now())))\n", - " context.logger.info(f\"Timestamp: {results['timestamp']}\")\n", - " results['stream'] = stream_name\n", - " results_df = pd.DataFrame(data=[list(results.values())],\n", - " columns=list(results.keys()))\n", - " results_df = results_df.set_index(['timestamp', 'stream'])\n", - " v3io_client.write('tsdb', results_tsdb_table, dfs=results_df)\n", - "# context.log_dataset('results', results_df, format='pq')" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import random" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Wine dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_wine" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import NewTask" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "wine = load_wine()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", - "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", - "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", - "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", - "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", - "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", - "\n", - " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", - "0 3.06 0.28 2.29 5.64 1.04 \n", - "1 2.76 0.26 1.28 4.38 1.05 \n", - "2 3.24 0.30 2.81 5.68 1.03 \n", - "3 3.49 0.24 2.18 7.80 0.86 \n", - "4 2.69 0.39 1.82 4.32 1.04 \n", - "\n", - " od280/od315_of_diluted_wines proline y prediction \n", - "0 3.92 1065.0 0 0 \n", - "1 3.40 1050.0 0 0 \n", - "2 3.17 1185.0 0 0 \n", - "3 3.45 1480.0 0 0 \n", - "4 2.93 735.0 0 0 " - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_wine = pd.DataFrame(data=wine['data'],\n", - " columns=wine['feature_names'])\n", - "df_wine['y'] = wine['target']\n", - "df_wine['prediction'] = wine['target']\n", - "df_wine.to_parquet('data/wine_t.pq')\n", - "df_wine.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "u = df_wine.sample(frac=0.5).copy()\n", - "# change_feature = [random.choice(wine['feature_names']), random.choice(wine['feature_names'])]\n", - "# u[change_feature] = 1\n", - "u.to_parquet('data/wine_u.pq')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-07-14 13:58:50,586 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create job function object from notebook code\n", - "fn = code_to_function(\"virtual_drift\", \n", - " kind='job', \n", - " image='mlrun/ml-models')\n", - "\n", - "# add 
metadata (for templates and reuse)\n", - "fn.spec.default_handler = \"drift_magnitude\"\n", - "fn.spec.description = \"Compute drift magnitude between Time-Samples T and U\"\n", - "fn.metadata.categories = [\"ml\", \"serve\", \"concept-drift\"]\n", - "fn.metadata.labels = {\"author\": \"orz\"}\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.apply(mount_v3io())" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(name='drift_magnitude',\n", - " handler='drift_magnitude',\n", - " params={'label_col': 'y',\n", - " 'results_tsdb_container': 'bigdata',\n", - " 'results_tsdb_table': 'drift_magnitude'},\n", - " inputs={'t': '/User/functions/virtual_drift/data/wine_t.pq',\n", - " 'u': '/User/functions/virtual_drift/data/wine_u.pq'},\n", - " artifact_path=os.path.abspath('/User/functions/virtual_drift/artifacts'))" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-02 12:56:25,352 starting run drift_magnitude uid=a20c78ddc72e45119ac4684bc4b32876 -> http://10.192.65.32:8080\n", - "[mlrun] 2020-06-02 12:56:26,121 Job is running in the background, pod: drift-magnitude-xqb5r\n", - "[mlrun] 2020-06-02 12:56:44,171 starting local run: main.py # drift_magnitude\n", - "[mlrun] 2020-06-02 12:56:48,652 Fitting discretizer for alcohol\n", - "[mlrun] 2020-06-02 12:56:48,655 Fitting discretizer for malic_acid\n", - "[mlrun] 2020-06-02 12:56:48,657 Fitting discretizer for ash\n", - "[mlrun] 2020-06-02 12:56:48,658 Fitting 
discretizer for alcalinity_of_ash\n", - "[mlrun] 2020-06-02 12:56:48,660 Fitting discretizer for magnesium\n", - "[mlrun] 2020-06-02 12:56:48,662 Fitting discretizer for total_phenols\n", - "[mlrun] 2020-06-02 12:56:48,663 Fitting discretizer for flavanoids\n", - "[mlrun] 2020-06-02 12:56:48,664 Fitting discretizer for nonflavanoid_phenols\n", - "[mlrun] 2020-06-02 12:56:48,666 Fitting discretizer for proanthocyanins\n", - "[mlrun] 2020-06-02 12:56:48,668 Fitting discretizer for color_intensity\n", - "[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for hue\n", - "[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for od280/od315_of_diluted_wines\n", - "[mlrun] 2020-06-02 12:56:48,672 Fitting discretizer for proline\n", - "[mlrun] 2020-06-02 12:56:48,752 log artifact discritizers at /User/demo-network-operations/artifacts/discritizer.pkl, size: None, db: Y\n", - "[mlrun] 2020-06-02 12:56:48,754 Discretizing featuers\n", - "[mlrun] 2020-06-02 12:56:49,199 log artifact t_discrete at /User/demo-network-operations/artifacts/t_discrete.parquet, size: 11803, db: Y\n", - "[mlrun] 2020-06-02 12:56:49,549 log artifact u_discrete at /User/demo-network-operations/artifacts/u_discrete.parquet, size: 12370, db: Y\n", - "[mlrun] 2020-06-02 12:56:49,552 Compute prior metrics\n", - "[mlrun] 2020-06-02 12:56:49,968 log artifact features_t_pdf at /User/demo-network-operations/artifacts/features_t_pdf.parquet, size: 4399, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,103 log artifact features_u_pdf at /User/demo-network-operations/artifacts/features_u_pdf.parquet, size: 4428, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,123 Compute class metrics\n", - "[mlrun] 2020-06-02 12:56:50,292 log artifact class_t_pdf at /User/demo-network-operations/artifacts/class_t_pdf.parquet, size: 2162, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,408 log artifact class_u_pdf at /User/demo-network-operations/artifacts/class_u_pdf.parquet, size: 2162, db: Y\n", - "[mlrun] 2020-06-02 12:56:50,424 value: inf\n", - 
"[mlrun] 2020-06-02 12:56:50,459 Timestamp: 2020-06-02 12:56:50.458032\n", - "\n", - "[mlrun] 2020-06-02 12:56:50,834 run executed, status=completed\n", - "/usr/local/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log\n", - " result = getattr(ufunc, method)(*inputs, **kwargs)\n", - "final state: succeeded\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 02 12:56:48completeddrift_magnitude
v3io_user=admin
kind=job
owner=admin
host=drift-magnitude-xqb5r
t
u
label_col=y
results_tsdb_container=bigdata
results_tsdb_table=drift_magnitude
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.028
class_shift_helinger=0.02
class_shift_kld=0.003
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run a20c78ddc72e45119ac4684bc4b32876 , !mlrun logs a20c78ddc72e45119ac4684bc4b32876 \n", - "[mlrun] 2020-06-02 12:56:57,590 run executed, status=completed\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn.with_code().run(task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/functions/development/virtual_drift/0.8.0/src/virtual_drift.py b/functions/development/virtual_drift/0.8.0/src/virtual_drift.py deleted file mode 100644 index e677d721..00000000 --- a/functions/development/virtual_drift/0.8.0/src/virtual_drift.py +++ /dev/null @@ -1,192 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import scipy as sp -import pickle -import datetime - -import v3io_frames as v3f - -import matplotlib.pyplot as plt -from sklearn.preprocessing import KBinsDiscretizer - - -def to_observations(context, t, u, key): - t = ( - t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() 
- ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf - - -def tvd(t, u): - return sum(abs(t - u)) / 2 - - -def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2) - - -def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t - - -def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u) - - -def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", - results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. 
- Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = 
discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - 
data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df) diff --git a/functions/development/virtual_drift/0.8.0/static/documentation.html b/functions/development/virtual_drift/0.8.0/static/documentation.html deleted file mode 100644 index 37387134..00000000 --- a/functions/development/virtual_drift/0.8.0/static/documentation.html +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - -virtual_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

virtual_drift package

-
-

Submodules

-
-
-

virtual_drift.virtual_drift module

-
-
-virtual_drift.virtual_drift.all_metrics(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.drift_magnitude(context, t: pandas.core.frame.DataFrame, u: pandas.core.frame.DataFrame, label_col=None, prediction_col=None, discretizers: Optional[dict] = None, n_bins=5, stream_name: str = 'some_stream', results_tsdb_container: str = 'bigdata', results_tsdb_table: str = 'concept_drift/drift_magnitude')[source]
-
-
Drift magnitude metrics

Computes drift magnitude metrics between base dataset t and dataset u. -Metrics:

-
-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • t – Base dataset for the drift metrics

  • -
  • u – Test dataset for the drift metrics

  • -
  • label_col – Label colum in t and u

  • -
  • prediction_col – Predictions column in t and u

  • -
  • discritizers – Dictionary of dicsritizers for the features if available -(Created automatically if not provided)

  • -
  • n_bins – Number of bins to be used for histrogram creation from continuous variables

  • -
  • stream_name – Output stream to push metrics to

  • -
  • results_tsdb_container – TSDB table container to push metrics to

  • -
  • results_tsdb_table – TSDB table to push metrics to

  • -
-
-
-
-
-
-virtual_drift.virtual_drift.helinger(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.kl_divergence(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.to_observations(context, t, u, key)[source]
-
-
-
-virtual_drift.virtual_drift.tvd(t, u)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.8.0/static/example.html b/functions/development/virtual_drift/0.8.0/static/example.html deleted file mode 100644 index 367da03d..00000000 --- a/functions/development/virtual_drift/0.8.0/static/example.html +++ /dev/null @@ -1,851 +0,0 @@ - - - - - - - -Virtual Drift - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- -
-
- Contents -
- -
-
-
-
-
-
-
-

Virtual Drift

-

Drift magnitude metrics -Computes drift magnitude metrics between base dataset t and dataset u.

-

Metrics:

-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-

Environment setup

-
-
-
%load_ext autoreload
-%autoreload 2
-
-
-
-
-
-
-
import nuclio
-
-
-
-
-
-
-
from mlrun import code_to_function, mount_v3io, run_local
-
-
-
-
-
-
-

Function

-
-
-
# nuclio: start-code
-
-
-
-
-
-
-
import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-
-
-
-
-
def to_observations(context, t, u, key):
-    # Create density
-    t = t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1).value_counts().sort_index()
-    u = u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1).value_counts().sort_index()
-
-    # Add 0s if needed
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ['t', 'u']
-
-    t_obs = joined_uniques.loc[:, 't']
-    u_obs = joined_uniques.loc[:, 'u']
-
-    t_pdf = t_obs/t_obs.sum()
-    u_pdf = u_obs/u_obs.sum()
-
-    context.log_dataset(f'{key}_t_pdf', pd.DataFrame(t_pdf), format='parquet')
-    context.log_dataset(f'{key}_u_pdf', pd.DataFrame(u_pdf), format='parquet')
-    return t_pdf, u_pdf
-
-
-
-
-
-
-
def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2))))/np.sqrt(2)
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-
-
-
-
-
def drift_magnitude(context, t: pd.DataFrame, u: pd.DataFrame, 
-         label_col=None, prediction_col=None, 
-         discretizers: dict = None, n_bins=5,
-         stream_name: str = 'some_stream',
-         results_tsdb_container: str = 'bigdata',
-         results_tsdb_table: str = 'concept_drift/drift_magnitude'):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-        
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-    
-    # Setup v3io connection and TSDB table
-    v3io_client = v3f.Client('framesd:8081', container=results_tsdb_container)
-    try:
-        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, rate='1/s')
-    except:
-        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, attrs={'rate': '1/s'})
-    
-    # Get input DFs
-    df_t = t.as_df()
-    df_u = u.as_df()
-    
-    # Get feature cols
-    
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-    
-    
-    # Discretize continuous featuers
-    continuous_features = df_t.select_dtypes(['float'])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f'Fitting discretizer for {feature}')
-            # Need to train a new discretizer
-            discretizer = KBinsDiscretizer(n_bins=n_bins,
-                                           encode='ordinal',
-                                           strategy='uniform')
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f'{context.artifact_path}/discritizer.pkl')
-    with open(discretizers_path, 'wb') as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact('discritizers', target_path=discretizers_path)
-    context.logger.info('Discretizing featuers')
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(df_t.loc[:, feature].values.reshape(-1, 1))
-        df_u[feature] = discretizer.transform(df_u.loc[:, feature].values.reshape(-1, 1))
-        df_t[feature] = df_t[feature].astype('int')
-        df_u[feature] = df_u[feature].astype('int')
-    context.log_dataset('t_discrete', df_t, format='parquet')
-    context.log_dataset('u_discrete', df_u, format='parquet')
-    
-    # Estimate probabilities 
-    # P(X), P(y), P(X|y), P(y|X) for t and u
-    
-    context.logger.info('Compute prior metrics')
-    
-    results = {}
-    t_prior, u_prior = to_observations(context, df_t.drop(drop_columns, axis=1), 
-                                       df_u.drop(drop_columns, axis=1), 'features')
-    results['prior_tvd'], results['prior_helinger'], results['prior_kld'] = all_metrics(t_prior, u_prior)
-    
-    if prediction_col is not None:
-        context.logger.info('Compute prediction metrics')
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(context, t_predictions,
-                                           u_predictions, 'prediction')
-        results['prediction_shift_tvd'], results['prediction_shift_helinger'], results['prediction_shift_kld'] = all_metrics(t_class, u_class)
-        
-    if label_col is not None:
-        context.logger.info('Compute class metrics')
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels,
-                                           u_labels, 'class')
-        results['class_shift_tvd'], results['class_shift_helinger'], results['class_shift_kld'] = all_metrics(t_class, u_class)
-    
-    for key, value in results.items():
-        if value == float('inf'):
-            context.logger.info(f'value: {value}')
-            results[key]=10
-    # Log results
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-        
-    # Push results to TSDB
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-    
-    results['timestamp'] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results['stream'] = stream_name
-    results_df = pd.DataFrame(data=[list(results.values())],
-                              columns=list(results.keys()))
-    results_df = results_df.set_index(['timestamp', 'stream'])
-    v3io_client.write('tsdb', results_tsdb_table, dfs=results_df)
-#     context.log_dataset('results', results_df, format='pq')
-
-
-
-
-
-
-
# nuclio: end-code
-
-
-
-
-
-
-
-

Test

-
-
-
import random
-
-
-
-
-
-

Wine dataset

-
-
-
from sklearn.datasets import load_wine
-
-
-
-
-
-
-
from mlrun import NewTask
-
-
-
-
-
-
-
wine = load_wine()
-
-
-
-
-
-
-
df_wine = pd.DataFrame(data=wine['data'],
-                       columns=wine['feature_names'])
-df_wine['y'] = wine['target']
-df_wine['prediction'] = wine['target']
-df_wine.to_parquet('data/wine_t.pq')
-df_wine.head()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
-
-
-
-
-
u = df_wine.sample(frac=0.5).copy()
-# change_feature = [random.choice(wine['feature_names']), random.choice(wine['feature_names'])]
-# u[change_feature] = 1
-u.to_parquet('data/wine_u.pq')
-
-
-
-
-
-
-
# create job function object from notebook code
-fn = code_to_function("virtual_drift", 
-                      kind='job', 
-                      image='mlrun/ml-models')
-
-# add metadata (for templates and reuse)
-fn.spec.default_handler = "drift_magnitude"
-fn.spec.description = "Compute drift magnitude between Time-Samples T and U"
-fn.metadata.categories = ["ml", "serve", "concept-drift"]
-fn.metadata.labels = {"author": "orz"}
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-07-14 13:58:50,586 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f23c8ab3f28>
-
-
-
-
-
-
-
fn.deploy()
-
-
-
-
-
-
-
fn.apply(mount_v3io())
-
-
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9b09fb0e80>
-
-
-
-
-
-
-
task = NewTask(name='drift_magnitude',
-               handler='drift_magnitude',
-               params={'label_col': 'y',
-                       'results_tsdb_container': 'bigdata',
-                       'results_tsdb_table': 'drift_magnitude'},
-               inputs={'t': '/User/functions/virtual_drift/data/wine_t.pq',
-                       'u': '/User/functions/virtual_drift/data/wine_u.pq'},
-               artifact_path=os.path.abspath('/User/functions/virtual_drift/artifacts'))
-
-
-
-
-
-
-
fn.with_code().run(task)
-
-
-
-
-
[mlrun] 2020-06-02 12:56:25,352 starting run drift_magnitude uid=a20c78ddc72e45119ac4684bc4b32876  -> http://10.192.65.32:8080
-[mlrun] 2020-06-02 12:56:26,121 Job is running in the background, pod: drift-magnitude-xqb5r
-[mlrun] 2020-06-02 12:56:44,171 starting local run: main.py # drift_magnitude
-[mlrun] 2020-06-02 12:56:48,652 Fitting discretizer for alcohol
-[mlrun] 2020-06-02 12:56:48,655 Fitting discretizer for malic_acid
-[mlrun] 2020-06-02 12:56:48,657 Fitting discretizer for ash
-[mlrun] 2020-06-02 12:56:48,658 Fitting discretizer for alcalinity_of_ash
-[mlrun] 2020-06-02 12:56:48,660 Fitting discretizer for magnesium
-[mlrun] 2020-06-02 12:56:48,662 Fitting discretizer for total_phenols
-[mlrun] 2020-06-02 12:56:48,663 Fitting discretizer for flavanoids
-[mlrun] 2020-06-02 12:56:48,664 Fitting discretizer for nonflavanoid_phenols
-[mlrun] 2020-06-02 12:56:48,666 Fitting discretizer for proanthocyanins
-[mlrun] 2020-06-02 12:56:48,668 Fitting discretizer for color_intensity
-[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for hue
-[mlrun] 2020-06-02 12:56:48,670 Fitting discretizer for od280/od315_of_diluted_wines
-[mlrun] 2020-06-02 12:56:48,672 Fitting discretizer for proline
-[mlrun] 2020-06-02 12:56:48,752 log artifact discritizers at /User/demo-network-operations/artifacts/discritizer.pkl, size: None, db: Y
-[mlrun] 2020-06-02 12:56:48,754 Discretizing featuers
-[mlrun] 2020-06-02 12:56:49,199 log artifact t_discrete at /User/demo-network-operations/artifacts/t_discrete.parquet, size: 11803, db: Y
-[mlrun] 2020-06-02 12:56:49,549 log artifact u_discrete at /User/demo-network-operations/artifacts/u_discrete.parquet, size: 12370, db: Y
-[mlrun] 2020-06-02 12:56:49,552 Compute prior metrics
-[mlrun] 2020-06-02 12:56:49,968 log artifact features_t_pdf at /User/demo-network-operations/artifacts/features_t_pdf.parquet, size: 4399, db: Y
-[mlrun] 2020-06-02 12:56:50,103 log artifact features_u_pdf at /User/demo-network-operations/artifacts/features_u_pdf.parquet, size: 4428, db: Y
-[mlrun] 2020-06-02 12:56:50,123 Compute class metrics
-[mlrun] 2020-06-02 12:56:50,292 log artifact class_t_pdf at /User/demo-network-operations/artifacts/class_t_pdf.parquet, size: 2162, db: Y
-[mlrun] 2020-06-02 12:56:50,408 log artifact class_u_pdf at /User/demo-network-operations/artifacts/class_u_pdf.parquet, size: 2162, db: Y
-[mlrun] 2020-06-02 12:56:50,424 value: inf
-[mlrun] 2020-06-02 12:56:50,459 Timestamp: 2020-06-02 12:56:50.458032
-
-[mlrun] 2020-06-02 12:56:50,834 run executed, status=completed
-/usr/local/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log
-  result = getattr(ufunc, method)(*inputs, **kwargs)
-final state: succeeded
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 02 12:56:48completeddrift_magnitude
v3io_user=admin
kind=job
owner=admin
host=drift-magnitude-xqb5r
t
u
label_col=y
results_tsdb_container=bigdata
results_tsdb_table=drift_magnitude
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.028
class_shift_helinger=0.02
class_shift_kld=0.003
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
-
- -
-
to track results use .show() or .logs() or in CLI: 
-!mlrun get run a20c78ddc72e45119ac4684bc4b32876  , !mlrun logs a20c78ddc72e45119ac4684bc4b32876 
-[mlrun] 2020-06-02 12:56:57,590 run executed, status=completed
-
-
-
<mlrun.model.RunObject at 0x7f9b0d967128>
-
-
-
-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.8.0/static/function.html b/functions/development/virtual_drift/0.8.0/static/function.html deleted file mode 100644 index 69d956a8..00000000 --- a/functions/development/virtual_drift/0.8.0/static/function.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: virtual-drift
-  tag: ''
-  hash: 8990fdd72fc550189a0c8b488b69997428b786c9
-  project: default
-  labels:
-    author: orz
-  categories:
-  - data-analysis
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: drift_magnitude
-  entry_points:
-    to_observations:
-      name: to_observations
-      doc: ''
-      parameters:
-      - name: context
-        default: ''
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      - name: key
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 16
-    tvd:
-      name: tvd
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 42
-    helinger:
-      name: helinger
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 46
-    kl_divergence:
-      name: kl_divergence
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 50
-    all_metrics:
-      name: all_metrics
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 56
-    drift_magnitude:
-      name: drift_magnitude
-      doc: "Drift magnitude metrics\n   Computes drift magnitude metrics between base\
-        \ dataset t and dataset u.\n   Metrics:\n    - TVD (Total Variation Distance)\n\
-        \    - Helinger\n    - KL Divergence"
-      parameters:
-      - name: context
-        doc: MLRun context
-        default: ''
-      - name: t
-        type: DataFrame
-        doc: Base dataset for the drift metrics
-        default: ''
-      - name: u
-        type: DataFrame
-        doc: Test dataset for the drift metrics
-        default: ''
-      - name: label_col
-        doc: Label colum in t and u
-        default: null
-      - name: prediction_col
-        doc: Predictions column in t and u
-        default: null
-      - name: discretizers
-        type: dict
-        default: null
-      - name: n_bins
-        doc: Number of bins to be used for histrogram creation from continuous variables
-        default: 5
-      - name: stream_name
-        type: str
-        doc: Output stream to push metrics to
-        default: some_stream
-      - name: results_tsdb_container
-        type: str
-        doc: TSDB table container to push metrics to
-        default: bigdata
-      - name: results_tsdb_table
-        type: str
-        doc: TSDB table to push metrics to
-        default: concept_drift/drift_magnitude
-      outputs:
-      - default: ''
-      lineno: 60
-  description: Compute drift magnitude between Time-Samples T and U
-  build:
-    functionSourceCode: 
-    commands:
-    - python -m pip install scikit-learn scipy v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.8.0/static/item.html b/functions/development/virtual_drift/0.8.0/static/item.html deleted file mode 100644 index 78861044..00000000 --- a/functions/development/virtual_drift/0.8.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-- machine-learning
-description: Compute drift magnitude between Time-Samples T and U
-doc: ''
-example: virtual_drift.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: virtual-drift
-platformVersion: 3.2.0
-spec:
-  filename: virtual_drift.py
-  handler: drift_magnitude
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-learn
-  - scipy
-  - v3io_frames
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.8.0/static/source.html b/functions/development/virtual_drift/0.8.0/static/source.html deleted file mode 100644 index 802cf249..00000000 --- a/functions/development/virtual_drift/0.8.0/static/source.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-def to_observations(context, t, u, key):
-    t = (
-        t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-    u = (
-        u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ["t", "u"]
-
-    t_obs = joined_uniques.loc[:, "t"]
-    u_obs = joined_uniques.loc[:, "u"]
-
-    t_pdf = t_obs / t_obs.sum()
-    u_pdf = u_obs / u_obs.sum()
-
-    context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet")
-    context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet")
-    return t_pdf, u_pdf
-
-
-def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
-
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-def drift_magnitude(
-    context,
-    t: pd.DataFrame,
-    u: pd.DataFrame,
-    label_col=None,
-    prediction_col=None,
-    discretizers: dict = None,
-    n_bins=5,
-    stream_name: str = "some_stream",
-    results_tsdb_container: str = "bigdata",
-    results_tsdb_table: str = "concept_drift/drift_magnitude",
-):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-
-    v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container)
-    try:
-        v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s")
-    except:
-        v3io_client.create(
-            "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"}
-        )
-
-    df_t = t.as_df()
-    df_u = u.as_df()
-
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-
-    continuous_features = df_t.select_dtypes(["float"])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f"Fitting discretizer for {feature}")
-            discretizer = KBinsDiscretizer(
-                n_bins=n_bins, encode="ordinal", strategy="uniform"
-            )
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl")
-    with open(discretizers_path, "wb") as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact("discritizers", target_path=discretizers_path)
-    context.logger.info("Discretizing featuers")
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(
-            df_t.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_u[feature] = discretizer.transform(
-            df_u.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_t[feature] = df_t[feature].astype("int")
-        df_u[feature] = df_u[feature].astype("int")
-    context.log_dataset("t_discrete", df_t, format="parquet")
-    context.log_dataset("u_discrete", df_u, format="parquet")
-
-    context.logger.info("Compute prior metrics")
-
-    results = {}
-    t_prior, u_prior = to_observations(
-        context,
-        df_t.drop(drop_columns, axis=1),
-        df_u.drop(drop_columns, axis=1),
-        "features",
-    )
-    results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics(
-        t_prior, u_prior
-    )
-
-    if prediction_col is not None:
-        context.logger.info("Compute prediction metrics")
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(
-            context, t_predictions, u_predictions, "prediction"
-        )
-        (
-            results["prediction_shift_tvd"],
-            results["prediction_shift_helinger"],
-            results["prediction_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    if label_col is not None:
-        context.logger.info("Compute class metrics")
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels, u_labels, "class")
-        (
-            results["class_shift_tvd"],
-            results["class_shift_helinger"],
-            results["class_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    for key, value in results.items():
-        if value == float("inf"):
-            context.logger.info(f"value: {value}")
-            results[key] = 10
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-
-    results["timestamp"] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results["stream"] = stream_name
-    results_df = pd.DataFrame(
-        data=[list(results.values())], columns=list(results.keys())
-    )
-    results_df = results_df.set_index(["timestamp", "stream"])
-    v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.9.0/src/README.md b/functions/development/virtual_drift/0.9.0/src/README.md deleted file mode 100644 index cd738390..00000000 --- a/functions/development/virtual_drift/0.9.0/src/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Drift Magnitude - -Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We can use the following Drift Magnitude metrics to map and understand our concepts and how close the properties of the data we used to train the models on are to the current data we receive. - -## How to integrate - -The Virtual Drift function is built to receive two data batches of data (as `dataitem` or `Dataframe`), base batch *t* and current batch *u*. - -```markdown -:param context: MLRun context -:param t: Base dataset for the drift metrics -:param u: Test dataset for the drift metrics -:param label_col: Label colum in t and u -:param prediction_col: Predictions column in t and u -:param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) -:param n_bins: Number of bins to be used for histrogram creation from continuous variables -:param stream_name: Output stream to push metrics to -:param results_tsdb_container: TSDB table container to push metrics to -:param results_tsdb_table: TSDB table to push metrics to -``` - -The function will calculate the selected drift mangitude metrics that were selected and apply them to the **features**, **labels** and **predictions**. It will then save those metrics and export them via Parquet and TSDB. Alerting could be added on top of the metrics via Grafana or a function. 
- -## Metrics - -The drift magnitude metrics we calculate are: - -### TVD - Total Variation Distance - -Provides a symetric drift distance between two periods *t* and *u* -Z - vector of random variables -P*t* - Probability distribution over timespan *t* - -![\sigma_{t, u}(Z)=\frac{1}{2}\sum_{\hat{z}\in{dom(Z)}}{|P_t{(\hat{Z})-P_u{(\hat{Z})}}|}]() - -### Helinger Distance - -Hellinger distance is an *f* divergence measuer, similar to the Kullback-Leibler (KL) divergence. However, unlike KL Divergence the Hellinger divergence is symmetric and bounded over a probability space. - -P, Q - Discrete probability distributions (P*i*, ..., P*k*). - -![H(P,Q)=\frac{1}{\sqrt{2}}\sqrt{\sum_{i=1}^{k}{(\sqrt{p_i}-\sqrt{q_i})^2}}]() - - -### KL Divergence - -KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another. It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality. KL Divergence of 0, indicates two identical distributrions. - -![D_{KL}(P||Q)=\sum_{x\in{X}}{(P(x)\log{\frac{P(x)}{Q(x)}})}]() - -## Additional Resources - -Webb, Geoffrey I. et al. “[Characterizing Concept Drift.](https://arxiv.org/abs/1511.03816)” Data Mining and Knowledge Discovery 30.4 (2016): 964–994. Crossref. Web. 
- -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/virtual_drift/0.9.0/src/function.yaml b/functions/development/virtual_drift/0.9.0/src/function.yaml deleted file mode 100644 index 55dcec11..00000000 --- a/functions/development/virtual_drift/0.9.0/src/function.yaml +++ /dev/null @@ -1,129 +0,0 @@ -kind: job -metadata: - name: virtual-drift - tag: '' - hash: 8990fdd72fc550189a0c8b488b69997428b786c9 - project: '' - labels: - author: orz - categories: - - data-analysis - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: drift_magnitude - entry_points: - to_observations: - name: to_observations - doc: '' - parameters: - - name: context - default: '' - - name: t - default: '' - - name: u - default: '' - - name: key - default: '' - outputs: - - default: '' - lineno: 16 - tvd: - name: tvd - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 42 - helinger: - name: helinger - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 46 - kl_divergence: - name: kl_divergence - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 50 - all_metrics: - name: all_metrics - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 56 - drift_magnitude: - name: drift_magnitude - doc: "Drift magnitude metrics\n Computes drift magnitude metrics between base\ - \ dataset t and dataset u.\n Metrics:\n - TVD (Total Variation Distance)\n\ - \ - Helinger\n - KL Divergence" - parameters: - - name: context - doc: MLRun context - default: '' - - name: t - type: DataFrame - doc: Base dataset for the drift metrics - default: '' - - name: u - type: DataFrame - doc: Test dataset for the drift 
metrics - default: '' - - name: label_col - doc: Label colum in t and u - default: null - - name: prediction_col - doc: Predictions column in t and u - default: null - - name: discretizers - type: dict - default: null - - name: n_bins - doc: Number of bins to be used for histrogram creation from continuous variables - default: 5 - - name: stream_name - type: str - doc: Output stream to push metrics to - default: some_stream - - name: results_tsdb_container - type: str - doc: TSDB table container to push metrics to - default: bigdata - - name: results_tsdb_table - type: str - doc: TSDB table to push metrics to - default: concept_drift/drift_magnitude - outputs: - - default: '' - lineno: 60 - description: Compute drift magnitude between Time-Samples T and U - build: - functionSourceCode:  - commands: - - python -m pip install scikit-learn scipy v3io_frames - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py - affinity: null -verbose: false diff --git a/functions/development/virtual_drift/0.9.0/src/item.yaml b/functions/development/virtual_drift/0.9.0/src/item.yaml deleted file mode 100644 index 41f99e9e..00000000 --- a/functions/development/virtual_drift/0.9.0/src/item.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -- machine-learning -description: Compute drift magnitude between Time-Samples T and U -doc: '' -example: virtual_drift.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: virtual-drift -platformVersion: 3.2.0 -spec: - filename: virtual_drift.py - handler: drift_magnitude - image: mlrun/ml-models - kind: job - requirements: - - scikit-learn - - scipy - - v3io_frames -url: '' -version: 0.9.0 diff --git a/functions/development/virtual_drift/0.9.0/src/virtual_drift.ipynb b/functions/development/virtual_drift/0.9.0/src/virtual_drift.ipynb 
deleted file mode 100644 index 23b9ef43..00000000 --- a/functions/development/virtual_drift/0.9.0/src/virtual_drift.ipynb +++ /dev/null @@ -1,935 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Virtual Drift\n", - "\n", - "Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u. \n", - "\n", - "Metrics:\n", - "- TVD (Total Variation Distance)\n", - "- Helinger\n", - "- KL Divergence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Running the function locally](#Running-the-function-locally)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".. 
_wine_dataset:\n", - "\n", - "Wine recognition dataset\n", - "------------------------\n", - "\n", - "**Data Set Characteristics:**\n", - "\n", - " :Number of Instances: 178 (50 in each of three classes)\n", - " :Number of Attributes: 13 numeric, predictive attributes and the class\n", - " :Attribute Information:\n", - " \t\t- Alcohol\n", - " \t\t- Malic acid\n", - " \t\t- Ash\n", - "\t\t- Alcalinity of ash \n", - " \t\t- Magnesium\n", - "\t\t- Total phenols\n", - " \t\t- Flavanoids\n", - " \t\t- Nonflavanoid phenols\n", - " \t\t- Proanthocyanins\n", - "\t\t- Color intensity\n", - " \t\t- Hue\n", - " \t\t- OD280/OD315 of diluted wines\n", - " \t\t- Proline\n", - "\n", - " - class:\n", - " - class_0\n", - " - class_1\n", - " - class_2\n", - "\t\t\n", - " :Summary Statistics:\n", - " \n", - " ============================= ==== ===== ======= =====\n", - " Min Max Mean SD\n", - " ============================= ==== ===== ======= =====\n", - " Alcohol: 11.0 14.8 13.0 0.8\n", - " Malic Acid: 0.74 5.80 2.34 1.12\n", - " Ash: 1.36 3.23 2.36 0.27\n", - " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", - " Magnesium: 70.0 162.0 99.7 14.3\n", - " Total Phenols: 0.98 3.88 2.29 0.63\n", - " Flavanoids: 0.34 5.08 2.03 1.00\n", - " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", - " Proanthocyanins: 0.41 3.58 1.59 0.57\n", - " Colour Intensity: 1.3 13.0 5.1 2.3\n", - " Hue: 0.48 1.71 0.96 0.23\n", - " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", - " Proline: 278 1680 746 315\n", - " ============================= ==== ===== ======= =====\n", - "\n", - " :Missing Attribute Values: None\n", - " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", - " :Creator: R.A. 
Fisher\n", - " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", - " :Date: July, 1988\n", - "\n", - "This is a copy of UCI ML Wine recognition datasets.\n", - "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", - "\n", - "The data is the results of a chemical analysis of wines grown in the same\n", - "region in Italy by three different cultivators. There are thirteen different\n", - "measurements taken for different constituents found in the three types of\n", - "wine.\n", - "\n", - "Original Owners: \n", - "\n", - "Forina, M. et al, PARVUS - \n", - "An Extendible Package for Data Exploration, Classification and Correlation. \n", - "Institute of Pharmaceutical and Food Analysis and Technologies,\n", - "Via Brigata Salerno, 16147 Genoa, Italy.\n", - "\n", - "Citation:\n", - "\n", - "Lichman, M. (2013). UCI Machine Learning Repository\n", - "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", - "School of Information and Computer Science. \n", - "\n", - ".. topic:: References\n", - "\n", - " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", - " Comparison of Classifiers in High Dimensional Settings, \n", - " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. \n", - " (Also submitted to Technometrics). \n", - "\n", - " The data was used with many others for comparing various \n", - " classifiers. The classes are separable, though only RDA \n", - " has achieved 100% correct classification. \n", - " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", - " (All results using the leave-one-out technique) \n", - "\n", - " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", - " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", - " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. 
\n", - " (Also submitted to Journal of Chemometrics).\n", - "\n" - ] - } - ], - "source": [ - "# Scikit-learn's wine dataset\n", - "from sklearn.datasets import load_wine\n", - "\n", - "wine = load_wine()\n", - "print(wine[\"DESCR\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. \n", - "wine_t shape is 178 and wine_u shape is 89 \n", - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", - "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", - "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", - "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", - "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", - "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", - "\n", - " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", - "0 3.06 0.28 2.29 5.64 1.04 \n", - "1 2.76 0.26 1.28 4.38 1.05 \n", - "2 3.24 0.30 2.81 5.68 1.03 \n", - "3 3.49 0.24 2.18 7.80 0.86 \n", - "4 2.69 0.39 1.82 4.32 1.04 \n", - "\n", - " od280/od315_of_diluted_wines proline y prediction \n", - "0 3.92 1065.0 0 0 \n", - "1 3.40 1050.0 0 0 \n", - "2 3.17 1185.0 0 0 \n", - "3 3.45 1480.0 0 0 \n", - "4 2.93 735.0 0 0 " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wine_t_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_t.pq'\n", - "wine_u_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_u.pq'\n", - "wine_t=pd.read_parquet(wine_t_path)\n", - "wine_u=pd.read_parquet(wine_u_path)\n", - "print(f'wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. 
\\n\\\n", - "wine_t shape is {wine_t.shape[0]} and wine_u shape is {wine_u.shape[0]} \\n\\n')\n", - "wine_t.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 13:45:22,345 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - "\n", - "# Importing the function\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://virtual_drift\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "tsdb_path = os.path.join(user,rel_path) + \"/output_tsdb\"" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:41,020 [info] starting run virtual-drift-drift_magnitude uid=28ec7f08ce7c4c528114e2590ff49325 DB=http://mlrun-api:8080\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning - Server version '0.8.14' is different from client version '0.9.4'. 
Some operations may not work as expected.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:43,469 [info] Fitting discretizer for alcohol\n", - "> 2021-10-26 14:00:43,471 [info] Fitting discretizer for malic_acid\n", - "> 2021-10-26 14:00:43,471 [info] Fitting discretizer for ash\n", - "> 2021-10-26 14:00:43,472 [info] Fitting discretizer for alcalinity_of_ash\n", - "> 2021-10-26 14:00:43,473 [info] Fitting discretizer for magnesium\n", - "> 2021-10-26 14:00:43,474 [info] Fitting discretizer for total_phenols\n", - "> 2021-10-26 14:00:43,475 [info] Fitting discretizer for flavanoids\n", - "> 2021-10-26 14:00:43,476 [info] Fitting discretizer for nonflavanoid_phenols\n", - "> 2021-10-26 14:00:43,477 [info] Fitting discretizer for proanthocyanins\n", - "> 2021-10-26 14:00:43,477 [info] Fitting discretizer for color_intensity\n", - "> 2021-10-26 14:00:43,478 [info] Fitting discretizer for hue\n", - "> 2021-10-26 14:00:43,479 [info] Fitting discretizer for od280/od315_of_diluted_wines\n", - "> 2021-10-26 14:00:43,480 [info] Fitting discretizer for proline\n", - "> 2021-10-26 14:00:43,531 [info] Discretizing featuers\n", - "> 2021-10-26 14:00:43,752 [info] Compute prior metrics\n", - "> 2021-10-26 14:00:43,889 [info] Compute class metrics\n", - "> 2021-10-26 14:00:44,000 [info] value: inf\n", - "> 2021-10-26 14:00:44,009 [info] Timestamp: 2021-10-26 14:00:44.008992\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "divide by zero encountered in log\n", - "casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 26 14:00:41completedvirtual-drift-drift_magnitude
v3io_user=dani
kind=
owner=dani
host=jupyter-dani-6bfbd76d96-zxx6f
t
u
label_col=y
results_tsdb_container=users
results_tsdb_table=dani/test/functions/virtual_drift/artifacts/output_tsdb
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.017
class_shift_helinger=0.014
class_shift_kld=0.002
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:44,153 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "virtual_drift_run=fn.run(params={'label_col': 'y',\n", - " 'results_tsdb_container': container[1:],\n", - " 'results_tsdb_table': tsdb_path},\n", - " inputs={'t': wine_t_path,\n", - " 'u': wine_u_path},\n", - " artifact_path=os.getcwd(),\n", - " local=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
u
00.348315
10.382022
20.269663
\n", - "
" - ], - "text/plain": [ - " u\n", - "0 0.348315\n", - "1 0.382022\n", - "2 0.269663" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
t
00.331461
10.398876
20.269663
\n", - "
" - ], - "text/plain": [ - " t\n", - "0 0.331461\n", - "1 0.398876\n", - "2 0.269663" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "virtual_drift_run.artifact('class_u_pdf').show()\n", - "virtual_drift_run.artifact('class_t_pdf').show()" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 13:58:04.445000+00:000.013980.0015640.0168540.54119610.00.5some_stream
2021-10-26 14:00:44.008000+00:000.013980.0015640.0168540.54119610.00.5some_stream
\n", - "
" - ], - "text/plain": [ - " class_shift_helinger class_shift_kld \\\n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.01398 0.001564 \n", - "2021-10-26 14:00:44.008000+00:00 0.01398 0.001564 \n", - "\n", - " class_shift_tvd prior_helinger prior_kld \\\n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.016854 0.541196 10.0 \n", - "2021-10-26 14:00:44.008000+00:00 0.016854 0.541196 10.0 \n", - "\n", - " prior_tvd stream \n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.5 some_stream \n", - "2021-10-26 14:00:44.008000+00:00 0.5 some_stream " - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import v3io_frames as v3f\n", - "client = v3f.Client(os.environ[\"V3IO_FRAMESD\"],container=container[1:])\n", - "client.read(backend='tsdb',table=tsdb_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Virtual-Drift)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/virtual_drift/0.9.0/src/virtual_drift.py b/functions/development/virtual_drift/0.9.0/src/virtual_drift.py deleted file mode 100644 index e677d721..00000000 --- a/functions/development/virtual_drift/0.9.0/src/virtual_drift.py +++ /dev/null @@ -1,192 +0,0 @@ -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import scipy as sp -import pickle -import datetime - -import v3io_frames as v3f - -import matplotlib.pyplot as plt -from sklearn.preprocessing import KBinsDiscretizer - - -def to_observations(context, t, u, key): - t = ( - t.apply(lambda 
row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() - ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf - - -def tvd(t, u): - return sum(abs(t - u)) / 2 - - -def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2) - - -def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t - - -def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u) - - -def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", - results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. 
- Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = 
discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - 
data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df) diff --git a/functions/development/virtual_drift/0.9.0/static/documentation.html b/functions/development/virtual_drift/0.9.0/static/documentation.html deleted file mode 100644 index 37387134..00000000 --- a/functions/development/virtual_drift/0.9.0/static/documentation.html +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - -virtual_drift package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

virtual_drift package

-
-

Submodules

-
-
-

virtual_drift.virtual_drift module

-
-
-virtual_drift.virtual_drift.all_metrics(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.drift_magnitude(context, t: pandas.core.frame.DataFrame, u: pandas.core.frame.DataFrame, label_col=None, prediction_col=None, discretizers: Optional[dict] = None, n_bins=5, stream_name: str = 'some_stream', results_tsdb_container: str = 'bigdata', results_tsdb_table: str = 'concept_drift/drift_magnitude')[source]
-
-
Drift magnitude metrics

Computes drift magnitude metrics between base dataset t and dataset u. -Metrics:

-
-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • t – Base dataset for the drift metrics

  • -
  • u – Test dataset for the drift metrics

  • -
  • label_col – Label colum in t and u

  • -
  • prediction_col – Predictions column in t and u

  • -
  • discritizers – Dictionary of dicsritizers for the features if available -(Created automatically if not provided)

  • -
  • n_bins – Number of bins to be used for histrogram creation from continuous variables

  • -
  • stream_name – Output stream to push metrics to

  • -
  • results_tsdb_container – TSDB table container to push metrics to

  • -
  • results_tsdb_table – TSDB table to push metrics to

  • -
-
-
-
-
-
-virtual_drift.virtual_drift.helinger(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.kl_divergence(t, u)[source]
-
-
-
-virtual_drift.virtual_drift.to_observations(context, t, u, key)[source]
-
-
-
-virtual_drift.virtual_drift.tvd(t, u)[source]
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.9.0/static/example.html b/functions/development/virtual_drift/0.9.0/static/example.html deleted file mode 100644 index 512fbc45..00000000 --- a/functions/development/virtual_drift/0.9.0/static/example.html +++ /dev/null @@ -1,842 +0,0 @@ - - - - - - - -Virtual Drift - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

Virtual Drift

-

Drift magnitude metrics -Computes drift magnitude metrics between base dataset t and dataset u.

-

Metrics:

-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
- -
-

Data exploration

-
-
-
# Scikit-learn's wine dataset
-from sklearn.datasets import load_wine
-
-wine = load_wine()
-print(wine["DESCR"])
-
-
-
-
-
.. _wine_dataset:
-
-Wine recognition dataset
-------------------------
-
-**Data Set Characteristics:**
-
-    :Number of Instances: 178 (50 in each of three classes)
-    :Number of Attributes: 13 numeric, predictive attributes and the class
-    :Attribute Information:
- 		- Alcohol
- 		- Malic acid
- 		- Ash
-		- Alcalinity of ash  
- 		- Magnesium
-		- Total phenols
- 		- Flavanoids
- 		- Nonflavanoid phenols
- 		- Proanthocyanins
-		- Color intensity
- 		- Hue
- 		- OD280/OD315 of diluted wines
- 		- Proline
-
-    - class:
-            - class_0
-            - class_1
-            - class_2
-		
-    :Summary Statistics:
-    
-    ============================= ==== ===== ======= =====
-                                   Min   Max   Mean     SD
-    ============================= ==== ===== ======= =====
-    Alcohol:                      11.0  14.8    13.0   0.8
-    Malic Acid:                   0.74  5.80    2.34  1.12
-    Ash:                          1.36  3.23    2.36  0.27
-    Alcalinity of Ash:            10.6  30.0    19.5   3.3
-    Magnesium:                    70.0 162.0    99.7  14.3
-    Total Phenols:                0.98  3.88    2.29  0.63
-    Flavanoids:                   0.34  5.08    2.03  1.00
-    Nonflavanoid Phenols:         0.13  0.66    0.36  0.12
-    Proanthocyanins:              0.41  3.58    1.59  0.57
-    Colour Intensity:              1.3  13.0     5.1   2.3
-    Hue:                          0.48  1.71    0.96  0.23
-    OD280/OD315 of diluted wines: 1.27  4.00    2.61  0.71
-    Proline:                       278  1680     746   315
-    ============================= ==== ===== ======= =====
-
-    :Missing Attribute Values: None
-    :Class Distribution: class_0 (59), class_1 (71), class_2 (48)
-    :Creator: R.A. Fisher
-    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
-    :Date: July, 1988
-
-This is a copy of UCI ML Wine recognition datasets.
-https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
-
-The data is the results of a chemical analysis of wines grown in the same
-region in Italy by three different cultivators. There are thirteen different
-measurements taken for different constituents found in the three types of
-wine.
-
-Original Owners: 
-
-Forina, M. et al, PARVUS - 
-An Extendible Package for Data Exploration, Classification and Correlation. 
-Institute of Pharmaceutical and Food Analysis and Technologies,
-Via Brigata Salerno, 16147 Genoa, Italy.
-
-Citation:
-
-Lichman, M. (2013). UCI Machine Learning Repository
-[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
-School of Information and Computer Science. 
-
-.. topic:: References
-
-  (1) S. Aeberhard, D. Coomans and O. de Vel, 
-  Comparison of Classifiers in High Dimensional Settings, 
-  Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of  
-  Mathematics and Statistics, James Cook University of North Queensland. 
-  (Also submitted to Technometrics). 
-
-  The data was used with many others for comparing various 
-  classifiers. The classes are separable, though only RDA 
-  has achieved 100% correct classification. 
-  (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) 
-  (All results using the leave-one-out technique) 
-
-  (2) S. Aeberhard, D. Coomans and O. de Vel, 
-  "THE CLASSIFICATION PERFORMANCE OF RDA" 
-  Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of 
-  Mathematics and Statistics, James Cook University of North Queensland. 
-  (Also submitted to Journal of Chemometrics).
-
-
-
-
-
-
-
wine_t_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_t.pq'
-wine_u_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_u.pq'
-wine_t=pd.read_parquet(wine_t_path)
-wine_u=pd.read_parquet(wine_u_path)
-print(f'wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. \n\
-wine_t shape is {wine_t.shape[0]} and wine_u shape is {wine_u.shape[0]} \n\n')
-wine_t.head()
-
-
-
-
-
wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. 
-wine_t shape is 178 and wine_u shape is 89 
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
-
-
-
-
-

Importing the function

-
-
-
import mlrun
-
-# Importing the function
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://virtual_drift")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-26 13:45:22,345 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff54a864dd0>
-
-
-
-
-
-
-

Running the function locally

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-tsdb_path = os.path.join(user,rel_path) + "/output_tsdb"
-
-
-
-
-
-
-
virtual_drift_run=fn.run(params={'label_col': 'y',
-                                 'results_tsdb_container': container[1:],
-                                 'results_tsdb_table': tsdb_path},
-                         inputs={'t': wine_t_path,
-                                 'u': wine_u_path},
-                         artifact_path=os.getcwd(),
-                         local=True)
-
-
-
-
-
> 2021-10-26 14:00:41,020 [info] starting run virtual-drift-drift_magnitude uid=28ec7f08ce7c4c528114e2590ff49325 DB=http://mlrun-api:8080
-
-
-
Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.
-
-
-
> 2021-10-26 14:00:43,469 [info] Fitting discretizer for alcohol
-> 2021-10-26 14:00:43,471 [info] Fitting discretizer for malic_acid
-> 2021-10-26 14:00:43,471 [info] Fitting discretizer for ash
-> 2021-10-26 14:00:43,472 [info] Fitting discretizer for alcalinity_of_ash
-> 2021-10-26 14:00:43,473 [info] Fitting discretizer for magnesium
-> 2021-10-26 14:00:43,474 [info] Fitting discretizer for total_phenols
-> 2021-10-26 14:00:43,475 [info] Fitting discretizer for flavanoids
-> 2021-10-26 14:00:43,476 [info] Fitting discretizer for nonflavanoid_phenols
-> 2021-10-26 14:00:43,477 [info] Fitting discretizer for proanthocyanins
-> 2021-10-26 14:00:43,477 [info] Fitting discretizer for color_intensity
-> 2021-10-26 14:00:43,478 [info] Fitting discretizer for hue
-> 2021-10-26 14:00:43,479 [info] Fitting discretizer for od280/od315_of_diluted_wines
-> 2021-10-26 14:00:43,480 [info] Fitting discretizer for proline
-> 2021-10-26 14:00:43,531 [info] Discretizing featuers
-> 2021-10-26 14:00:43,752 [info] Compute prior metrics
-> 2021-10-26 14:00:43,889 [info] Compute class metrics
-> 2021-10-26 14:00:44,000 [info] value: inf
-> 2021-10-26 14:00:44,009 [info] Timestamp: 2021-10-26 14:00:44.008992
-
-
-
divide by zero encountered in log
-casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 26 14:00:41completedvirtual-drift-drift_magnitude
v3io_user=dani
kind=
owner=dani
host=jupyter-dani-6bfbd76d96-zxx6f
t
u
label_col=y
results_tsdb_container=users
results_tsdb_table=dani/test/functions/virtual_drift/artifacts/output_tsdb
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.017
class_shift_helinger=0.014
class_shift_kld=0.002
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-26 14:00:44,153 [info] run executed, status=completed
-
-
-
-
-
-
-
virtual_drift_run.artifact('class_u_pdf').show()
-virtual_drift_run.artifact('class_t_pdf').show()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - -
u
00.348315
10.382022
20.269663
-
- - - - - - - - - - - - - - - - - - - - - - -
t
00.331461
10.398876
20.269663
-
-
-
-
-
import v3io_frames as v3f
-client = v3f.Client(os.environ["V3IO_FRAMESD"],container=container[1:])
-client.read(backend='tsdb',table=tsdb_path)
-
-
-
-
-
Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 13:58:04.445000+00:000.013980.0015640.0168540.54119610.00.5some_stream
2021-10-26 14:00:44.008000+00:000.013980.0015640.0168540.54119610.00.5some_stream
-
-
-

Back to the top

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.9.0/static/function.html b/functions/development/virtual_drift/0.9.0/static/function.html deleted file mode 100644 index 989d4fe0..00000000 --- a/functions/development/virtual_drift/0.9.0/static/function.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: virtual-drift
-  tag: ''
-  hash: 8990fdd72fc550189a0c8b488b69997428b786c9
-  project: ''
-  labels:
-    author: orz
-  categories:
-  - data-analysis
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: drift_magnitude
-  entry_points:
-    to_observations:
-      name: to_observations
-      doc: ''
-      parameters:
-      - name: context
-        default: ''
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      - name: key
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 16
-    tvd:
-      name: tvd
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 42
-    helinger:
-      name: helinger
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 46
-    kl_divergence:
-      name: kl_divergence
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 50
-    all_metrics:
-      name: all_metrics
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 56
-    drift_magnitude:
-      name: drift_magnitude
-      doc: "Drift magnitude metrics\n   Computes drift magnitude metrics between base\
-        \ dataset t and dataset u.\n   Metrics:\n    - TVD (Total Variation Distance)\n\
-        \    - Helinger\n    - KL Divergence"
-      parameters:
-      - name: context
-        doc: MLRun context
-        default: ''
-      - name: t
-        type: DataFrame
-        doc: Base dataset for the drift metrics
-        default: ''
-      - name: u
-        type: DataFrame
-        doc: Test dataset for the drift metrics
-        default: ''
-      - name: label_col
-        doc: Label colum in t and u
-        default: null
-      - name: prediction_col
-        doc: Predictions column in t and u
-        default: null
-      - name: discretizers
-        type: dict
-        default: null
-      - name: n_bins
-        doc: Number of bins to be used for histrogram creation from continuous variables
-        default: 5
-      - name: stream_name
-        type: str
-        doc: Output stream to push metrics to
-        default: some_stream
-      - name: results_tsdb_container
-        type: str
-        doc: TSDB table container to push metrics to
-        default: bigdata
-      - name: results_tsdb_table
-        type: str
-        doc: TSDB table to push metrics to
-        default: concept_drift/drift_magnitude
-      outputs:
-      - default: ''
-      lineno: 60
-  description: Compute drift magnitude between Time-Samples T and U
-  build:
-    functionSourceCode: 
-    commands:
-    - python -m pip install scikit-learn scipy v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.9.0/static/item.html b/functions/development/virtual_drift/0.9.0/static/item.html deleted file mode 100644 index 524877b1..00000000 --- a/functions/development/virtual_drift/0.9.0/static/item.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-- machine-learning
-description: Compute drift magnitude between Time-Samples T and U
-doc: ''
-example: virtual_drift.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: virtual-drift
-platformVersion: 3.2.0
-spec:
-  filename: virtual_drift.py
-  handler: drift_magnitude
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-learn
-  - scipy
-  - v3io_frames
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/0.9.0/static/source.html b/functions/development/virtual_drift/0.9.0/static/source.html deleted file mode 100644 index 802cf249..00000000 --- a/functions/development/virtual_drift/0.9.0/static/source.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-def to_observations(context, t, u, key):
-    t = (
-        t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-    u = (
-        u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ["t", "u"]
-
-    t_obs = joined_uniques.loc[:, "t"]
-    u_obs = joined_uniques.loc[:, "u"]
-
-    t_pdf = t_obs / t_obs.sum()
-    u_pdf = u_obs / u_obs.sum()
-
-    context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet")
-    context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet")
-    return t_pdf, u_pdf
-
-
-def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
-
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-def drift_magnitude(
-    context,
-    t: pd.DataFrame,
-    u: pd.DataFrame,
-    label_col=None,
-    prediction_col=None,
-    discretizers: dict = None,
-    n_bins=5,
-    stream_name: str = "some_stream",
-    results_tsdb_container: str = "bigdata",
-    results_tsdb_table: str = "concept_drift/drift_magnitude",
-):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-
-    v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container)
-    try:
-        v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s")
-    except:
-        v3io_client.create(
-            "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"}
-        )
-
-    df_t = t.as_df()
-    df_u = u.as_df()
-
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-
-    continuous_features = df_t.select_dtypes(["float"])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f"Fitting discretizer for {feature}")
-            discretizer = KBinsDiscretizer(
-                n_bins=n_bins, encode="ordinal", strategy="uniform"
-            )
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl")
-    with open(discretizers_path, "wb") as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact("discritizers", target_path=discretizers_path)
-    context.logger.info("Discretizing featuers")
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(
-            df_t.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_u[feature] = discretizer.transform(
-            df_u.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_t[feature] = df_t[feature].astype("int")
-        df_u[feature] = df_u[feature].astype("int")
-    context.log_dataset("t_discrete", df_t, format="parquet")
-    context.log_dataset("u_discrete", df_u, format="parquet")
-
-    context.logger.info("Compute prior metrics")
-
-    results = {}
-    t_prior, u_prior = to_observations(
-        context,
-        df_t.drop(drop_columns, axis=1),
-        df_u.drop(drop_columns, axis=1),
-        "features",
-    )
-    results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics(
-        t_prior, u_prior
-    )
-
-    if prediction_col is not None:
-        context.logger.info("Compute prediction metrics")
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(
-            context, t_predictions, u_predictions, "prediction"
-        )
-        (
-            results["prediction_shift_tvd"],
-            results["prediction_shift_helinger"],
-            results["prediction_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    if label_col is not None:
-        context.logger.info("Compute class metrics")
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels, u_labels, "class")
-        (
-            results["class_shift_tvd"],
-            results["class_shift_helinger"],
-            results["class_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    for key, value in results.items():
-        if value == float("inf"):
-            context.logger.info(f"value: {value}")
-            results[key] = 10
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-
-    results["timestamp"] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results["stream"] = stream_name
-    results_df = pd.DataFrame(
-        data=[list(results.values())], columns=list(results.keys())
-    )
-    results_df = results_df.set_index(["timestamp", "stream"])
-    v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/1.1.0/src/README.md b/functions/development/virtual_drift/1.1.0/src/README.md deleted file mode 100644 index cd738390..00000000 --- a/functions/development/virtual_drift/1.1.0/src/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Drift Magnitude - -Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We can use the following Drift Magnitude metrics to map and understand our concepts and how close the properties of the data we used to train the models on are to the current data we receive. - -## How to integrate - -The Virtual Drift function is built to receive two data batches of data (as `dataitem` or `Dataframe`), base batch *t* and current batch *u*. - -```markdown -:param context: MLRun context -:param t: Base dataset for the drift metrics -:param u: Test dataset for the drift metrics -:param label_col: Label colum in t and u -:param prediction_col: Predictions column in t and u -:param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) -:param n_bins: Number of bins to be used for histrogram creation from continuous variables -:param stream_name: Output stream to push metrics to -:param results_tsdb_container: TSDB table container to push metrics to -:param results_tsdb_table: TSDB table to push metrics to -``` - -The function will calculate the selected drift mangitude metrics that were selected and apply them to the **features**, **labels** and **predictions**. It will then save those metrics and export them via Parquet and TSDB. Alerting could be added on top of the metrics via Grafana or a function. 
- -## Metrics - -The drift magnitude metrics we calculate are: - -### TVD - Total Variation Distance - -Provides a symetric drift distance between two periods *t* and *u* -Z - vector of random variables -P*t* - Probability distribution over timespan *t* - -![\sigma_{t, u}(Z)=\frac{1}{2}\sum_{\hat{z}\in{dom(Z)}}{|P_t{(\hat{Z})-P_u{(\hat{Z})}}|}]() - -### Helinger Distance - -Hellinger distance is an *f* divergence measuer, similar to the Kullback-Leibler (KL) divergence. However, unlike KL Divergence the Hellinger divergence is symmetric and bounded over a probability space. - -P, Q - Discrete probability distributions (P*i*, ..., P*k*). - -![H(P,Q)=\frac{1}{\sqrt{2}}\sqrt{\sum_{i=1}^{k}{(\sqrt{p_i}-\sqrt{q_i})^2}}]() - - -### KL Divergence - -KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another. It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality. KL Divergence of 0, indicates two identical distributrions. - -![D_{KL}(P||Q)=\sum_{x\in{X}}{(P(x)\log{\frac{P(x)}{Q(x)}})}]() - -## Additional Resources - -Webb, Geoffrey I. et al. “[Characterizing Concept Drift.](https://arxiv.org/abs/1511.03816)” Data Mining and Knowledge Discovery 30.4 (2016): 964–994. Crossref. Web. 
- -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/virtual_drift/1.1.0/src/function.yaml b/functions/development/virtual_drift/1.1.0/src/function.yaml deleted file mode 100644 index 55dcec11..00000000 --- a/functions/development/virtual_drift/1.1.0/src/function.yaml +++ /dev/null @@ -1,129 +0,0 @@ -kind: job -metadata: - name: virtual-drift - tag: '' - hash: 8990fdd72fc550189a0c8b488b69997428b786c9 - project: '' - labels: - author: orz - categories: - - data-analysis - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: drift_magnitude - entry_points: - to_observations: - name: to_observations - doc: '' - parameters: - - name: context - default: '' - - name: t - default: '' - - name: u - default: '' - - name: key - default: '' - outputs: - - default: '' - lineno: 16 - tvd: - name: tvd - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 42 - helinger: - name: helinger - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 46 - kl_divergence: - name: kl_divergence - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 50 - all_metrics: - name: all_metrics - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 56 - drift_magnitude: - name: drift_magnitude - doc: "Drift magnitude metrics\n Computes drift magnitude metrics between base\ - \ dataset t and dataset u.\n Metrics:\n - TVD (Total Variation Distance)\n\ - \ - Helinger\n - KL Divergence" - parameters: - - name: context - doc: MLRun context - default: '' - - name: t - type: DataFrame - doc: Base dataset for the drift metrics - default: '' - - name: u - type: DataFrame - doc: Test dataset for the drift 
metrics - default: '' - - name: label_col - doc: Label colum in t and u - default: null - - name: prediction_col - doc: Predictions column in t and u - default: null - - name: discretizers - type: dict - default: null - - name: n_bins - doc: Number of bins to be used for histrogram creation from continuous variables - default: 5 - - name: stream_name - type: str - doc: Output stream to push metrics to - default: some_stream - - name: results_tsdb_container - type: str - doc: TSDB table container to push metrics to - default: bigdata - - name: results_tsdb_table - type: str - doc: TSDB table to push metrics to - default: concept_drift/drift_magnitude - outputs: - - default: '' - lineno: 60 - description: Compute drift magnitude between Time-Samples T and U - build: - functionSourceCode:  - commands: - - python -m pip install scikit-learn scipy v3io_frames - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py - affinity: null -verbose: false diff --git a/functions/development/virtual_drift/1.1.0/src/item.yaml b/functions/development/virtual_drift/1.1.0/src/item.yaml deleted file mode 100644 index d66f9e9c..00000000 --- a/functions/development/virtual_drift/1.1.0/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -- machine-learning -description: Compute drift magnitude between Time-Samples T and U -doc: '' -example: virtual_drift.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: virtual-drift -platformVersion: 3.5.0 -spec: - filename: virtual_drift.py - handler: drift_magnitude - image: mlrun/ml-models - kind: job - requirements: - - scikit-learn - - scipy - - v3io_frames -url: '' -version: 1.1.0 diff --git a/functions/development/virtual_drift/1.1.0/src/virtual_drift.ipynb 
b/functions/development/virtual_drift/1.1.0/src/virtual_drift.ipynb deleted file mode 100644 index 23b9ef43..00000000 --- a/functions/development/virtual_drift/1.1.0/src/virtual_drift.ipynb +++ /dev/null @@ -1,935 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Virtual Drift\n", - "\n", - "Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u. \n", - "\n", - "Metrics:\n", - "- TVD (Total Variation Distance)\n", - "- Helinger\n", - "- KL Divergence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Running the function locally](#Running-the-function-locally)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".. 
_wine_dataset:\n", - "\n", - "Wine recognition dataset\n", - "------------------------\n", - "\n", - "**Data Set Characteristics:**\n", - "\n", - " :Number of Instances: 178 (50 in each of three classes)\n", - " :Number of Attributes: 13 numeric, predictive attributes and the class\n", - " :Attribute Information:\n", - " \t\t- Alcohol\n", - " \t\t- Malic acid\n", - " \t\t- Ash\n", - "\t\t- Alcalinity of ash \n", - " \t\t- Magnesium\n", - "\t\t- Total phenols\n", - " \t\t- Flavanoids\n", - " \t\t- Nonflavanoid phenols\n", - " \t\t- Proanthocyanins\n", - "\t\t- Color intensity\n", - " \t\t- Hue\n", - " \t\t- OD280/OD315 of diluted wines\n", - " \t\t- Proline\n", - "\n", - " - class:\n", - " - class_0\n", - " - class_1\n", - " - class_2\n", - "\t\t\n", - " :Summary Statistics:\n", - " \n", - " ============================= ==== ===== ======= =====\n", - " Min Max Mean SD\n", - " ============================= ==== ===== ======= =====\n", - " Alcohol: 11.0 14.8 13.0 0.8\n", - " Malic Acid: 0.74 5.80 2.34 1.12\n", - " Ash: 1.36 3.23 2.36 0.27\n", - " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", - " Magnesium: 70.0 162.0 99.7 14.3\n", - " Total Phenols: 0.98 3.88 2.29 0.63\n", - " Flavanoids: 0.34 5.08 2.03 1.00\n", - " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", - " Proanthocyanins: 0.41 3.58 1.59 0.57\n", - " Colour Intensity: 1.3 13.0 5.1 2.3\n", - " Hue: 0.48 1.71 0.96 0.23\n", - " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", - " Proline: 278 1680 746 315\n", - " ============================= ==== ===== ======= =====\n", - "\n", - " :Missing Attribute Values: None\n", - " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", - " :Creator: R.A. 
Fisher\n", - " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", - " :Date: July, 1988\n", - "\n", - "This is a copy of UCI ML Wine recognition datasets.\n", - "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", - "\n", - "The data is the results of a chemical analysis of wines grown in the same\n", - "region in Italy by three different cultivators. There are thirteen different\n", - "measurements taken for different constituents found in the three types of\n", - "wine.\n", - "\n", - "Original Owners: \n", - "\n", - "Forina, M. et al, PARVUS - \n", - "An Extendible Package for Data Exploration, Classification and Correlation. \n", - "Institute of Pharmaceutical and Food Analysis and Technologies,\n", - "Via Brigata Salerno, 16147 Genoa, Italy.\n", - "\n", - "Citation:\n", - "\n", - "Lichman, M. (2013). UCI Machine Learning Repository\n", - "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", - "School of Information and Computer Science. \n", - "\n", - ".. topic:: References\n", - "\n", - " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", - " Comparison of Classifiers in High Dimensional Settings, \n", - " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. \n", - " (Also submitted to Technometrics). \n", - "\n", - " The data was used with many others for comparing various \n", - " classifiers. The classes are separable, though only RDA \n", - " has achieved 100% correct classification. \n", - " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", - " (All results using the leave-one-out technique) \n", - "\n", - " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", - " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", - " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. 
\n", - " (Also submitted to Journal of Chemometrics).\n", - "\n" - ] - } - ], - "source": [ - "# Scikit-learn's wine dataset\n", - "from sklearn.datasets import load_wine\n", - "\n", - "wine = load_wine()\n", - "print(wine[\"DESCR\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. \n", - "wine_t shape is 178 and wine_u shape is 89 \n", - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", - "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", - "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", - "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", - "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", - "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", - "\n", - " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", - "0 3.06 0.28 2.29 5.64 1.04 \n", - "1 2.76 0.26 1.28 4.38 1.05 \n", - "2 3.24 0.30 2.81 5.68 1.03 \n", - "3 3.49 0.24 2.18 7.80 0.86 \n", - "4 2.69 0.39 1.82 4.32 1.04 \n", - "\n", - " od280/od315_of_diluted_wines proline y prediction \n", - "0 3.92 1065.0 0 0 \n", - "1 3.40 1050.0 0 0 \n", - "2 3.17 1185.0 0 0 \n", - "3 3.45 1480.0 0 0 \n", - "4 2.93 735.0 0 0 " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wine_t_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_t.pq'\n", - "wine_u_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_u.pq'\n", - "wine_t=pd.read_parquet(wine_t_path)\n", - "wine_u=pd.read_parquet(wine_u_path)\n", - "print(f'wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. 
\\n\\\n", - "wine_t shape is {wine_t.shape[0]} and wine_u shape is {wine_u.shape[0]} \\n\\n')\n", - "wine_t.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 13:45:22,345 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - "\n", - "# Importing the function\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://virtual_drift\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "tsdb_path = os.path.join(user,rel_path) + \"/output_tsdb\"" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:41,020 [info] starting run virtual-drift-drift_magnitude uid=28ec7f08ce7c4c528114e2590ff49325 DB=http://mlrun-api:8080\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning - Server version '0.8.14' is different from client version '0.9.4'. 
Some operations may not work as expected.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:43,469 [info] Fitting discretizer for alcohol\n", - "> 2021-10-26 14:00:43,471 [info] Fitting discretizer for malic_acid\n", - "> 2021-10-26 14:00:43,471 [info] Fitting discretizer for ash\n", - "> 2021-10-26 14:00:43,472 [info] Fitting discretizer for alcalinity_of_ash\n", - "> 2021-10-26 14:00:43,473 [info] Fitting discretizer for magnesium\n", - "> 2021-10-26 14:00:43,474 [info] Fitting discretizer for total_phenols\n", - "> 2021-10-26 14:00:43,475 [info] Fitting discretizer for flavanoids\n", - "> 2021-10-26 14:00:43,476 [info] Fitting discretizer for nonflavanoid_phenols\n", - "> 2021-10-26 14:00:43,477 [info] Fitting discretizer for proanthocyanins\n", - "> 2021-10-26 14:00:43,477 [info] Fitting discretizer for color_intensity\n", - "> 2021-10-26 14:00:43,478 [info] Fitting discretizer for hue\n", - "> 2021-10-26 14:00:43,479 [info] Fitting discretizer for od280/od315_of_diluted_wines\n", - "> 2021-10-26 14:00:43,480 [info] Fitting discretizer for proline\n", - "> 2021-10-26 14:00:43,531 [info] Discretizing featuers\n", - "> 2021-10-26 14:00:43,752 [info] Compute prior metrics\n", - "> 2021-10-26 14:00:43,889 [info] Compute class metrics\n", - "> 2021-10-26 14:00:44,000 [info] value: inf\n", - "> 2021-10-26 14:00:44,009 [info] Timestamp: 2021-10-26 14:00:44.008992\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "divide by zero encountered in log\n", - "casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 26 14:00:41completedvirtual-drift-drift_magnitude
v3io_user=dani
kind=
owner=dani
host=jupyter-dani-6bfbd76d96-zxx6f
t
u
label_col=y
results_tsdb_container=users
results_tsdb_table=dani/test/functions/virtual_drift/artifacts/output_tsdb
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.017
class_shift_helinger=0.014
class_shift_kld=0.002
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:44,153 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "virtual_drift_run=fn.run(params={'label_col': 'y',\n", - " 'results_tsdb_container': container[1:],\n", - " 'results_tsdb_table': tsdb_path},\n", - " inputs={'t': wine_t_path,\n", - " 'u': wine_u_path},\n", - " artifact_path=os.getcwd(),\n", - " local=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
u
00.348315
10.382022
20.269663
\n", - "
" - ], - "text/plain": [ - " u\n", - "0 0.348315\n", - "1 0.382022\n", - "2 0.269663" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
t
00.331461
10.398876
20.269663
\n", - "
" - ], - "text/plain": [ - " t\n", - "0 0.331461\n", - "1 0.398876\n", - "2 0.269663" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "virtual_drift_run.artifact('class_u_pdf').show()\n", - "virtual_drift_run.artifact('class_t_pdf').show()" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 13:58:04.445000+00:000.013980.0015640.0168540.54119610.00.5some_stream
2021-10-26 14:00:44.008000+00:000.013980.0015640.0168540.54119610.00.5some_stream
\n", - "
" - ], - "text/plain": [ - " class_shift_helinger class_shift_kld \\\n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.01398 0.001564 \n", - "2021-10-26 14:00:44.008000+00:00 0.01398 0.001564 \n", - "\n", - " class_shift_tvd prior_helinger prior_kld \\\n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.016854 0.541196 10.0 \n", - "2021-10-26 14:00:44.008000+00:00 0.016854 0.541196 10.0 \n", - "\n", - " prior_tvd stream \n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.5 some_stream \n", - "2021-10-26 14:00:44.008000+00:00 0.5 some_stream " - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import v3io_frames as v3f\n", - "client = v3f.Client(os.environ[\"V3IO_FRAMESD\"],container=container[1:])\n", - "client.read(backend='tsdb',table=tsdb_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Virtual-Drift)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/virtual_drift/1.1.0/src/virtual_drift.py b/functions/development/virtual_drift/1.1.0/src/virtual_drift.py deleted file mode 100644 index 71dcf712..00000000 --- a/functions/development/virtual_drift/1.1.0/src/virtual_drift.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import scipy as sp -import pickle -import datetime - -import v3io_frames as v3f - -import matplotlib.pyplot as plt -from sklearn.preprocessing import KBinsDiscretizer - - -def to_observations(context, t, u, key): - t = ( - t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() - ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf - - -def tvd(t, u): - return sum(abs(t - u)) / 2 - - -def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2) - - -def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t - - -def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u) - - -def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", 
- results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. - Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, 
f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - 
results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df) diff --git a/functions/development/virtual_drift/1.1.0/static/documentation.html b/functions/development/virtual_drift/1.1.0/static/documentation.html deleted file mode 100644 index 417e5ac4..00000000 --- a/functions/development/virtual_drift/1.1.0/static/documentation.html +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - -virtual_drift package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

virtual_drift package

- -
- -
-
-
-
-
-

virtual_drift package#

-
-

Submodules#

-
-
-

virtual_drift.virtual_drift module#

-
-
-virtual_drift.virtual_drift.all_metrics(t, u)[source]#
-
-
-
-virtual_drift.virtual_drift.drift_magnitude(context, t: pandas.core.frame.DataFrame, u: pandas.core.frame.DataFrame, label_col=None, prediction_col=None, discretizers: Optional[dict] = None, n_bins=5, stream_name: str = 'some_stream', results_tsdb_container: str = 'bigdata', results_tsdb_table: str = 'concept_drift/drift_magnitude')[source]#
-
-
Drift magnitude metrics

Computes drift magnitude metrics between base dataset t and dataset u. -Metrics:

-
-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • t – Base dataset for the drift metrics

  • -
  • u – Test dataset for the drift metrics

  • -
  • label_col – Label colum in t and u

  • -
  • prediction_col – Predictions column in t and u

  • -
  • discritizers – Dictionary of dicsritizers for the features if available -(Created automatically if not provided)

  • -
  • n_bins – Number of bins to be used for histrogram creation from continuous variables

  • -
  • stream_name – Output stream to push metrics to

  • -
  • results_tsdb_container – TSDB table container to push metrics to

  • -
  • results_tsdb_table – TSDB table to push metrics to

  • -
-
-
-
-
-
-virtual_drift.virtual_drift.helinger(t, u)[source]#
-
-
-
-virtual_drift.virtual_drift.kl_divergence(t, u)[source]#
-
-
-
-virtual_drift.virtual_drift.to_observations(context, t, u, key)[source]#
-
-
-
-virtual_drift.virtual_drift.tvd(t, u)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/virtual_drift/1.1.0/static/example.html b/functions/development/virtual_drift/1.1.0/static/example.html deleted file mode 100644 index aafbe33c..00000000 --- a/functions/development/virtual_drift/1.1.0/static/example.html +++ /dev/null @@ -1,962 +0,0 @@ - - - - - - - -Virtual Drift - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

Virtual Drift

- -
- -
-
-
-
-
-

Virtual Drift#

-

Drift magnitude metrics -Computes drift magnitude metrics between base dataset t and dataset u.

-

Metrics:

-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-

Steps#

-
    -
  1. Data exploration

  2. -
  3. Importing the function

  4. -
  5. Running the function locally

  6. -
-
-
-

Data exploration#

-
-
-
# Scikit-learn's wine dataset
-from sklearn.datasets import load_wine
-
-wine = load_wine()
-print(wine["DESCR"])
-
-
-
-
-
.. _wine_dataset:
-
-Wine recognition dataset
-------------------------
-
-**Data Set Characteristics:**
-
-    :Number of Instances: 178 (50 in each of three classes)
-    :Number of Attributes: 13 numeric, predictive attributes and the class
-    :Attribute Information:
- 		- Alcohol
- 		- Malic acid
- 		- Ash
-		- Alcalinity of ash  
- 		- Magnesium
-		- Total phenols
- 		- Flavanoids
- 		- Nonflavanoid phenols
- 		- Proanthocyanins
-		- Color intensity
- 		- Hue
- 		- OD280/OD315 of diluted wines
- 		- Proline
-
-    - class:
-            - class_0
-            - class_1
-            - class_2
-		
-    :Summary Statistics:
-    
-    ============================= ==== ===== ======= =====
-                                   Min   Max   Mean     SD
-    ============================= ==== ===== ======= =====
-    Alcohol:                      11.0  14.8    13.0   0.8
-    Malic Acid:                   0.74  5.80    2.34  1.12
-    Ash:                          1.36  3.23    2.36  0.27
-    Alcalinity of Ash:            10.6  30.0    19.5   3.3
-    Magnesium:                    70.0 162.0    99.7  14.3
-    Total Phenols:                0.98  3.88    2.29  0.63
-    Flavanoids:                   0.34  5.08    2.03  1.00
-    Nonflavanoid Phenols:         0.13  0.66    0.36  0.12
-    Proanthocyanins:              0.41  3.58    1.59  0.57
-    Colour Intensity:              1.3  13.0     5.1   2.3
-    Hue:                          0.48  1.71    0.96  0.23
-    OD280/OD315 of diluted wines: 1.27  4.00    2.61  0.71
-    Proline:                       278  1680     746   315
-    ============================= ==== ===== ======= =====
-
-    :Missing Attribute Values: None
-    :Class Distribution: class_0 (59), class_1 (71), class_2 (48)
-    :Creator: R.A. Fisher
-    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
-    :Date: July, 1988
-
-This is a copy of UCI ML Wine recognition datasets.
-https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
-
-The data is the results of a chemical analysis of wines grown in the same
-region in Italy by three different cultivators. There are thirteen different
-measurements taken for different constituents found in the three types of
-wine.
-
-Original Owners: 
-
-Forina, M. et al, PARVUS - 
-An Extendible Package for Data Exploration, Classification and Correlation. 
-Institute of Pharmaceutical and Food Analysis and Technologies,
-Via Brigata Salerno, 16147 Genoa, Italy.
-
-Citation:
-
-Lichman, M. (2013). UCI Machine Learning Repository
-[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
-School of Information and Computer Science. 
-
-.. topic:: References
-
-  (1) S. Aeberhard, D. Coomans and O. de Vel, 
-  Comparison of Classifiers in High Dimensional Settings, 
-  Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of  
-  Mathematics and Statistics, James Cook University of North Queensland. 
-  (Also submitted to Technometrics). 
-
-  The data was used with many others for comparing various 
-  classifiers. The classes are separable, though only RDA 
-  has achieved 100% correct classification. 
-  (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) 
-  (All results using the leave-one-out technique) 
-
-  (2) S. Aeberhard, D. Coomans and O. de Vel, 
-  "THE CLASSIFICATION PERFORMANCE OF RDA" 
-  Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of 
-  Mathematics and Statistics, James Cook University of North Queensland. 
-  (Also submitted to Journal of Chemometrics).
-
-
-
-
-
-
-
wine_t_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_t.pq'
-wine_u_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_u.pq'
-wine_t=pd.read_parquet(wine_t_path)
-wine_u=pd.read_parquet(wine_u_path)
-print(f'wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. \n\
-wine_t shape is {wine_t.shape[0]} and wine_u shape is {wine_u.shape[0]} \n\n')
-wine_t.head()
-
-
-
-
-
wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. 
-wine_t shape is 178 and wine_u shape is 89 
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
-
-
-
-
-

Importing the function#

-
-
-
import mlrun
-
-# Importing the function
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://virtual_drift")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-26 13:45:22,345 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff54a864dd0>
-
-
-
-
-
-
-

Running the function locally#

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-tsdb_path = os.path.join(user,rel_path) + "/output_tsdb"
-
-
-
-
-
-
-
virtual_drift_run=fn.run(params={'label_col': 'y',
-                                 'results_tsdb_container': container[1:],
-                                 'results_tsdb_table': tsdb_path},
-                         inputs={'t': wine_t_path,
-                                 'u': wine_u_path},
-                         artifact_path=os.getcwd(),
-                         local=True)
-
-
-
-
-
> 2021-10-26 14:00:41,020 [info] starting run virtual-drift-drift_magnitude uid=28ec7f08ce7c4c528114e2590ff49325 DB=http://mlrun-api:8080
-
-
-
Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.
-
-
-
> 2021-10-26 14:00:43,469 [info] Fitting discretizer for alcohol
-> 2021-10-26 14:00:43,471 [info] Fitting discretizer for malic_acid
-> 2021-10-26 14:00:43,471 [info] Fitting discretizer for ash
-> 2021-10-26 14:00:43,472 [info] Fitting discretizer for alcalinity_of_ash
-> 2021-10-26 14:00:43,473 [info] Fitting discretizer for magnesium
-> 2021-10-26 14:00:43,474 [info] Fitting discretizer for total_phenols
-> 2021-10-26 14:00:43,475 [info] Fitting discretizer for flavanoids
-> 2021-10-26 14:00:43,476 [info] Fitting discretizer for nonflavanoid_phenols
-> 2021-10-26 14:00:43,477 [info] Fitting discretizer for proanthocyanins
-> 2021-10-26 14:00:43,477 [info] Fitting discretizer for color_intensity
-> 2021-10-26 14:00:43,478 [info] Fitting discretizer for hue
-> 2021-10-26 14:00:43,479 [info] Fitting discretizer for od280/od315_of_diluted_wines
-> 2021-10-26 14:00:43,480 [info] Fitting discretizer for proline
-> 2021-10-26 14:00:43,531 [info] Discretizing featuers
-> 2021-10-26 14:00:43,752 [info] Compute prior metrics
-> 2021-10-26 14:00:43,889 [info] Compute class metrics
-> 2021-10-26 14:00:44,000 [info] value: inf
-> 2021-10-26 14:00:44,009 [info] Timestamp: 2021-10-26 14:00:44.008992
-
-
-
divide by zero encountered in log
-casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 26 14:00:41completedvirtual-drift-drift_magnitude
v3io_user=dani
kind=
owner=dani
host=jupyter-dani-6bfbd76d96-zxx6f
t
u
label_col=y
results_tsdb_container=users
results_tsdb_table=dani/test/functions/virtual_drift/artifacts/output_tsdb
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.017
class_shift_helinger=0.014
class_shift_kld=0.002
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-26 14:00:44,153 [info] run executed, status=completed
-
-
-
-
-
-
-
virtual_drift_run.artifact('class_u_pdf').show()
-virtual_drift_run.artifact('class_t_pdf').show()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - -
u
00.348315
10.382022
20.269663
-
- - - - - - - - - - - - - - - - - - - - - - -
t
00.331461
10.398876
20.269663
-
-
-
-
-
import v3io_frames as v3f
-client = v3f.Client(os.environ["V3IO_FRAMESD"],container=container[1:])
-client.read(backend='tsdb',table=tsdb_path)
-
-
-
-
-
Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 13:58:04.445000+00:000.013980.0015640.0168540.54119610.00.5some_stream
2021-10-26 14:00:44.008000+00:000.013980.0015640.0168540.54119610.00.5some_stream
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/virtual_drift/1.1.0/static/function.html b/functions/development/virtual_drift/1.1.0/static/function.html deleted file mode 100644 index 989d4fe0..00000000 --- a/functions/development/virtual_drift/1.1.0/static/function.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: virtual-drift
-  tag: ''
-  hash: 8990fdd72fc550189a0c8b488b69997428b786c9
-  project: ''
-  labels:
-    author: orz
-  categories:
-  - data-analysis
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: drift_magnitude
-  entry_points:
-    to_observations:
-      name: to_observations
-      doc: ''
-      parameters:
-      - name: context
-        default: ''
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      - name: key
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 16
-    tvd:
-      name: tvd
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 42
-    helinger:
-      name: helinger
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 46
-    kl_divergence:
-      name: kl_divergence
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 50
-    all_metrics:
-      name: all_metrics
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 56
-    drift_magnitude:
-      name: drift_magnitude
-      doc: "Drift magnitude metrics\n   Computes drift magnitude metrics between base\
-        \ dataset t and dataset u.\n   Metrics:\n    - TVD (Total Variation Distance)\n\
-        \    - Helinger\n    - KL Divergence"
-      parameters:
-      - name: context
-        doc: MLRun context
-        default: ''
-      - name: t
-        type: DataFrame
-        doc: Base dataset for the drift metrics
-        default: ''
-      - name: u
-        type: DataFrame
-        doc: Test dataset for the drift metrics
-        default: ''
-      - name: label_col
-        doc: Label colum in t and u
-        default: null
-      - name: prediction_col
-        doc: Predictions column in t and u
-        default: null
-      - name: discretizers
-        type: dict
-        default: null
-      - name: n_bins
-        doc: Number of bins to be used for histrogram creation from continuous variables
-        default: 5
-      - name: stream_name
-        type: str
-        doc: Output stream to push metrics to
-        default: some_stream
-      - name: results_tsdb_container
-        type: str
-        doc: TSDB table container to push metrics to
-        default: bigdata
-      - name: results_tsdb_table
-        type: str
-        doc: TSDB table to push metrics to
-        default: concept_drift/drift_magnitude
-      outputs:
-      - default: ''
-      lineno: 60
-  description: Compute drift magnitude between Time-Samples T and U
-  build:
-    functionSourceCode: 
-    commands:
-    - python -m pip install scikit-learn scipy v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/1.1.0/static/item.html b/functions/development/virtual_drift/1.1.0/static/item.html deleted file mode 100644 index 14d2ee3f..00000000 --- a/functions/development/virtual_drift/1.1.0/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-- machine-learning
-description: Compute drift magnitude between Time-Samples T and U
-doc: ''
-example: virtual_drift.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: virtual-drift
-platformVersion: 3.5.0
-spec:
-  filename: virtual_drift.py
-  handler: drift_magnitude
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-learn
-  - scipy
-  - v3io_frames
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/1.1.0/static/source.html b/functions/development/virtual_drift/1.1.0/static/source.html deleted file mode 100644 index 87b05c07..00000000 --- a/functions/development/virtual_drift/1.1.0/static/source.html +++ /dev/null @@ -1,228 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-def to_observations(context, t, u, key):
-    t = (
-        t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-    u = (
-        u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ["t", "u"]
-
-    t_obs = joined_uniques.loc[:, "t"]
-    u_obs = joined_uniques.loc[:, "u"]
-
-    t_pdf = t_obs / t_obs.sum()
-    u_pdf = u_obs / u_obs.sum()
-
-    context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet")
-    context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet")
-    return t_pdf, u_pdf
-
-
-def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
-
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-def drift_magnitude(
-    context,
-    t: pd.DataFrame,
-    u: pd.DataFrame,
-    label_col=None,
-    prediction_col=None,
-    discretizers: dict = None,
-    n_bins=5,
-    stream_name: str = "some_stream",
-    results_tsdb_container: str = "bigdata",
-    results_tsdb_table: str = "concept_drift/drift_magnitude",
-):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-
-    v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container)
-    try:
-        v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s")
-    except:
-        v3io_client.create(
-            "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"}
-        )
-
-    df_t = t.as_df()
-    df_u = u.as_df()
-
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-
-    continuous_features = df_t.select_dtypes(["float"])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f"Fitting discretizer for {feature}")
-            discretizer = KBinsDiscretizer(
-                n_bins=n_bins, encode="ordinal", strategy="uniform"
-            )
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl")
-    with open(discretizers_path, "wb") as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact("discritizers", target_path=discretizers_path)
-    context.logger.info("Discretizing featuers")
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(
-            df_t.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_u[feature] = discretizer.transform(
-            df_u.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_t[feature] = df_t[feature].astype("int")
-        df_u[feature] = df_u[feature].astype("int")
-    context.log_dataset("t_discrete", df_t, format="parquet")
-    context.log_dataset("u_discrete", df_u, format="parquet")
-
-    context.logger.info("Compute prior metrics")
-
-    results = {}
-    t_prior, u_prior = to_observations(
-        context,
-        df_t.drop(drop_columns, axis=1),
-        df_u.drop(drop_columns, axis=1),
-        "features",
-    )
-    results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics(
-        t_prior, u_prior
-    )
-
-    if prediction_col is not None:
-        context.logger.info("Compute prediction metrics")
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(
-            context, t_predictions, u_predictions, "prediction"
-        )
-        (
-            results["prediction_shift_tvd"],
-            results["prediction_shift_helinger"],
-            results["prediction_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    if label_col is not None:
-        context.logger.info("Compute class metrics")
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels, u_labels, "class")
-        (
-            results["class_shift_tvd"],
-            results["class_shift_helinger"],
-            results["class_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    for key, value in results.items():
-        if value == float("inf"):
-            context.logger.info(f"value: {value}")
-            results[key] = 10
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-
-    results["timestamp"] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results["stream"] = stream_name
-    results_df = pd.DataFrame(
-        data=[list(results.values())], columns=list(results.keys())
-    )
-    results_df = results_df.set_index(["timestamp", "stream"])
-    v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/1.1.0/static/virtual_drift.html b/functions/development/virtual_drift/1.1.0/static/virtual_drift.html deleted file mode 100644 index 0775caf1..00000000 --- a/functions/development/virtual_drift/1.1.0/static/virtual_drift.html +++ /dev/null @@ -1,346 +0,0 @@ - - - - - - - -virtual_drift.virtual_drift - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for virtual_drift.virtual_drift

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-
[docs]def to_observations(context, t, u, key): - t = ( - t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() - ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf
- - -
[docs]def tvd(t, u): - return sum(abs(t - u)) / 2
- - -
[docs]def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
- - -
[docs]def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t
- - -
[docs]def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u)
- - -
[docs]def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", - results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. - Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - 
os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - 
context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/virtual_drift/latest/src/README.md b/functions/development/virtual_drift/latest/src/README.md deleted file mode 100644 index cd738390..00000000 --- a/functions/development/virtual_drift/latest/src/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Drift Magnitude - -Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We can use the following Drift Magnitude metrics to map and understand our concepts and how close the properties of the data we used to train the models on are to the current data we receive. - -## How to integrate - -The Virtual Drift function is built to receive two data batches of data (as `dataitem` or `Dataframe`), base batch *t* and current batch *u*. - -```markdown -:param context: MLRun context -:param t: Base dataset for the drift metrics -:param u: Test dataset for the drift metrics -:param label_col: Label colum in t and u -:param prediction_col: Predictions column in t and u -:param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) -:param n_bins: Number of bins to be used for histrogram creation from continuous variables -:param stream_name: Output stream to push metrics to -:param results_tsdb_container: TSDB table container to push metrics to -:param results_tsdb_table: TSDB table to push metrics to -``` - -The function will calculate the selected drift mangitude metrics that were selected and apply them to the **features**, **labels** and **predictions**. It will then save those metrics and export them via Parquet and TSDB. Alerting could be added on top of the metrics via Grafana or a function. 
- -## Metrics - -The drift magnitude metrics we calculate are: - -### TVD - Total Variation Distance - -Provides a symetric drift distance between two periods *t* and *u* -Z - vector of random variables -P*t* - Probability distribution over timespan *t* - -![\sigma_{t, u}(Z)=\frac{1}{2}\sum_{\hat{z}\in{dom(Z)}}{|P_t{(\hat{Z})-P_u{(\hat{Z})}}|}]() - -### Helinger Distance - -Hellinger distance is an *f* divergence measuer, similar to the Kullback-Leibler (KL) divergence. However, unlike KL Divergence the Hellinger divergence is symmetric and bounded over a probability space. - -P, Q - Discrete probability distributions (P*i*, ..., P*k*). - -![H(P,Q)=\frac{1}{\sqrt{2}}\sqrt{\sum_{i=1}^{k}{(\sqrt{p_i}-\sqrt{q_i})^2}}]() - - -### KL Divergence - -KL Divergence (or relative entropy) is a measure of how one probability distribution differs from another. It is an asymmetric measure (thus it's not a metric) and it doesn't satisfy the triangle inequality. KL Divergence of 0, indicates two identical distributrions. - -![D_{KL}(P||Q)=\sum_{x\in{X}}{(P(x)\log{\frac{P(x)}{Q(x)}})}]() - -## Additional Resources - -Webb, Geoffrey I. et al. “[Characterizing Concept Drift.](https://arxiv.org/abs/1511.03816)” Data Mining and Knowledge Discovery 30.4 (2016): 964–994. Crossref. Web. 
- -[MLOps Live #4 - How to Detect & Remediate Drift in Production with MLOps Automation](https://www.youtube.com/watch?v=66_Q7mJZOSc&t=1296s) diff --git a/functions/development/virtual_drift/latest/src/function.yaml b/functions/development/virtual_drift/latest/src/function.yaml deleted file mode 100644 index 55dcec11..00000000 --- a/functions/development/virtual_drift/latest/src/function.yaml +++ /dev/null @@ -1,129 +0,0 @@ -kind: job -metadata: - name: virtual-drift - tag: '' - hash: 8990fdd72fc550189a0c8b488b69997428b786c9 - project: '' - labels: - author: orz - categories: - - data-analysis - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - env: [] - default_handler: drift_magnitude - entry_points: - to_observations: - name: to_observations - doc: '' - parameters: - - name: context - default: '' - - name: t - default: '' - - name: u - default: '' - - name: key - default: '' - outputs: - - default: '' - lineno: 16 - tvd: - name: tvd - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 42 - helinger: - name: helinger - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 46 - kl_divergence: - name: kl_divergence - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 50 - all_metrics: - name: all_metrics - doc: '' - parameters: - - name: t - default: '' - - name: u - default: '' - outputs: - - default: '' - lineno: 56 - drift_magnitude: - name: drift_magnitude - doc: "Drift magnitude metrics\n Computes drift magnitude metrics between base\ - \ dataset t and dataset u.\n Metrics:\n - TVD (Total Variation Distance)\n\ - \ - Helinger\n - KL Divergence" - parameters: - - name: context - doc: MLRun context - default: '' - - name: t - type: DataFrame - doc: Base dataset for the drift metrics - default: '' - - name: u - type: DataFrame - doc: Test dataset for the drift 
metrics - default: '' - - name: label_col - doc: Label colum in t and u - default: null - - name: prediction_col - doc: Predictions column in t and u - default: null - - name: discretizers - type: dict - default: null - - name: n_bins - doc: Number of bins to be used for histrogram creation from continuous variables - default: 5 - - name: stream_name - type: str - doc: Output stream to push metrics to - default: some_stream - - name: results_tsdb_container - type: str - doc: TSDB table container to push metrics to - default: bigdata - - name: results_tsdb_table - type: str - doc: TSDB table to push metrics to - default: concept_drift/drift_magnitude - outputs: - - default: '' - lineno: 60 - description: Compute drift magnitude between Time-Samples T and U - build: - functionSourceCode:  - commands: - - python -m pip install scikit-learn scipy v3io_frames - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py - affinity: null -verbose: false diff --git a/functions/development/virtual_drift/latest/src/item.yaml b/functions/development/virtual_drift/latest/src/item.yaml deleted file mode 100644 index d66f9e9c..00000000 --- a/functions/development/virtual_drift/latest/src/item.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -categories: -- data-analysis -- machine-learning -description: Compute drift magnitude between Time-Samples T and U -doc: '' -example: virtual_drift.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: orz -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: virtual-drift -platformVersion: 3.5.0 -spec: - filename: virtual_drift.py - handler: drift_magnitude - image: mlrun/ml-models - kind: job - requirements: - - scikit-learn - - scipy - - v3io_frames -url: '' -version: 1.1.0 diff --git a/functions/development/virtual_drift/latest/src/virtual_drift.ipynb 
b/functions/development/virtual_drift/latest/src/virtual_drift.ipynb deleted file mode 100644 index 23b9ef43..00000000 --- a/functions/development/virtual_drift/latest/src/virtual_drift.ipynb +++ /dev/null @@ -1,935 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Virtual Drift\n", - "\n", - "Drift magnitude metrics\n", - " Computes drift magnitude metrics between base dataset t and dataset u. \n", - "\n", - "Metrics:\n", - "- TVD (Total Variation Distance)\n", - "- Helinger\n", - "- KL Divergence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Steps**\n", - "\n", - "1. [Data exploration](#Data-exploration)\n", - "2. [Importing the function](#Importing-the-function)\n", - "3. [Running the function locally](#Running-the-function-locally)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Data exploration**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".. 
_wine_dataset:\n", - "\n", - "Wine recognition dataset\n", - "------------------------\n", - "\n", - "**Data Set Characteristics:**\n", - "\n", - " :Number of Instances: 178 (50 in each of three classes)\n", - " :Number of Attributes: 13 numeric, predictive attributes and the class\n", - " :Attribute Information:\n", - " \t\t- Alcohol\n", - " \t\t- Malic acid\n", - " \t\t- Ash\n", - "\t\t- Alcalinity of ash \n", - " \t\t- Magnesium\n", - "\t\t- Total phenols\n", - " \t\t- Flavanoids\n", - " \t\t- Nonflavanoid phenols\n", - " \t\t- Proanthocyanins\n", - "\t\t- Color intensity\n", - " \t\t- Hue\n", - " \t\t- OD280/OD315 of diluted wines\n", - " \t\t- Proline\n", - "\n", - " - class:\n", - " - class_0\n", - " - class_1\n", - " - class_2\n", - "\t\t\n", - " :Summary Statistics:\n", - " \n", - " ============================= ==== ===== ======= =====\n", - " Min Max Mean SD\n", - " ============================= ==== ===== ======= =====\n", - " Alcohol: 11.0 14.8 13.0 0.8\n", - " Malic Acid: 0.74 5.80 2.34 1.12\n", - " Ash: 1.36 3.23 2.36 0.27\n", - " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", - " Magnesium: 70.0 162.0 99.7 14.3\n", - " Total Phenols: 0.98 3.88 2.29 0.63\n", - " Flavanoids: 0.34 5.08 2.03 1.00\n", - " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", - " Proanthocyanins: 0.41 3.58 1.59 0.57\n", - " Colour Intensity: 1.3 13.0 5.1 2.3\n", - " Hue: 0.48 1.71 0.96 0.23\n", - " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", - " Proline: 278 1680 746 315\n", - " ============================= ==== ===== ======= =====\n", - "\n", - " :Missing Attribute Values: None\n", - " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", - " :Creator: R.A. 
Fisher\n", - " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", - " :Date: July, 1988\n", - "\n", - "This is a copy of UCI ML Wine recognition datasets.\n", - "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", - "\n", - "The data is the results of a chemical analysis of wines grown in the same\n", - "region in Italy by three different cultivators. There are thirteen different\n", - "measurements taken for different constituents found in the three types of\n", - "wine.\n", - "\n", - "Original Owners: \n", - "\n", - "Forina, M. et al, PARVUS - \n", - "An Extendible Package for Data Exploration, Classification and Correlation. \n", - "Institute of Pharmaceutical and Food Analysis and Technologies,\n", - "Via Brigata Salerno, 16147 Genoa, Italy.\n", - "\n", - "Citation:\n", - "\n", - "Lichman, M. (2013). UCI Machine Learning Repository\n", - "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", - "School of Information and Computer Science. \n", - "\n", - ".. topic:: References\n", - "\n", - " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", - " Comparison of Classifiers in High Dimensional Settings, \n", - " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. \n", - " (Also submitted to Technometrics). \n", - "\n", - " The data was used with many others for comparing various \n", - " classifiers. The classes are separable, though only RDA \n", - " has achieved 100% correct classification. \n", - " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", - " (All results using the leave-one-out technique) \n", - "\n", - " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", - " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", - " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. 
\n", - " (Also submitted to Journal of Chemometrics).\n", - "\n" - ] - } - ], - "source": [ - "# Scikit-learn's wine dataset\n", - "from sklearn.datasets import load_wine\n", - "\n", - "wine = load_wine()\n", - "print(wine[\"DESCR\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. \n", - "wine_t shape is 178 and wine_u shape is 89 \n", - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", - "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", - "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", - "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", - "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", - "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", - "\n", - " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", - "0 3.06 0.28 2.29 5.64 1.04 \n", - "1 2.76 0.26 1.28 4.38 1.05 \n", - "2 3.24 0.30 2.81 5.68 1.03 \n", - "3 3.49 0.24 2.18 7.80 0.86 \n", - "4 2.69 0.39 1.82 4.32 1.04 \n", - "\n", - " od280/od315_of_diluted_wines proline y prediction \n", - "0 3.92 1065.0 0 0 \n", - "1 3.40 1050.0 0 0 \n", - "2 3.17 1185.0 0 0 \n", - "3 3.45 1480.0 0 0 \n", - "4 2.93 735.0 0 0 " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wine_t_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_t.pq'\n", - "wine_u_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_u.pq'\n", - "wine_t=pd.read_parquet(wine_t_path)\n", - "wine_u=pd.read_parquet(wine_u_path)\n", - "print(f'wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. 
\\n\\\n", - "wine_t shape is {wine_t.shape[0]} and wine_u shape is {wine_u.shape[0]} \\n\\n')\n", - "wine_t.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Importing the function**" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 13:45:22,345 [info] created and saved project function-marketplace\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun\n", - "\n", - "# Importing the function\n", - "mlrun.set_environment(project='function-marketplace')\n", - "\n", - "fn = mlrun.import_function(\"hub://virtual_drift\")\n", - "fn.apply(mlrun.auto_mount())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### **Running the function locally**" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "\n", - "container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])\n", - "user = os.environ[\"V3IO_USERNAME\"]\n", - "rel_path = os.getcwd()[6:] + '/artifacts'\n", - "tsdb_path = os.path.join(user,rel_path) + \"/output_tsdb\"" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:41,020 [info] starting run virtual-drift-drift_magnitude uid=28ec7f08ce7c4c528114e2590ff49325 DB=http://mlrun-api:8080\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning - Server version '0.8.14' is different from client version '0.9.4'. 
Some operations may not work as expected.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:43,469 [info] Fitting discretizer for alcohol\n", - "> 2021-10-26 14:00:43,471 [info] Fitting discretizer for malic_acid\n", - "> 2021-10-26 14:00:43,471 [info] Fitting discretizer for ash\n", - "> 2021-10-26 14:00:43,472 [info] Fitting discretizer for alcalinity_of_ash\n", - "> 2021-10-26 14:00:43,473 [info] Fitting discretizer for magnesium\n", - "> 2021-10-26 14:00:43,474 [info] Fitting discretizer for total_phenols\n", - "> 2021-10-26 14:00:43,475 [info] Fitting discretizer for flavanoids\n", - "> 2021-10-26 14:00:43,476 [info] Fitting discretizer for nonflavanoid_phenols\n", - "> 2021-10-26 14:00:43,477 [info] Fitting discretizer for proanthocyanins\n", - "> 2021-10-26 14:00:43,477 [info] Fitting discretizer for color_intensity\n", - "> 2021-10-26 14:00:43,478 [info] Fitting discretizer for hue\n", - "> 2021-10-26 14:00:43,479 [info] Fitting discretizer for od280/od315_of_diluted_wines\n", - "> 2021-10-26 14:00:43,480 [info] Fitting discretizer for proline\n", - "> 2021-10-26 14:00:43,531 [info] Discretizing featuers\n", - "> 2021-10-26 14:00:43,752 [info] Compute prior metrics\n", - "> 2021-10-26 14:00:43,889 [info] Compute class metrics\n", - "> 2021-10-26 14:00:44,000 [info] value: inf\n", - "> 2021-10-26 14:00:44,009 [info] Timestamp: 2021-10-26 14:00:44.008992\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "divide by zero encountered in log\n", - "casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 26 14:00:41completedvirtual-drift-drift_magnitude
v3io_user=dani
kind=
owner=dani
host=jupyter-dani-6bfbd76d96-zxx6f
t
u
label_col=y
results_tsdb_container=users
results_tsdb_table=dani/test/functions/virtual_drift/artifacts/output_tsdb
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.017
class_shift_helinger=0.014
class_shift_kld=0.002
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-10-26 14:00:44,153 [info] run executed, status=completed\n" - ] - } - ], - "source": [ - "virtual_drift_run=fn.run(params={'label_col': 'y',\n", - " 'results_tsdb_container': container[1:],\n", - " 'results_tsdb_table': tsdb_path},\n", - " inputs={'t': wine_t_path,\n", - " 'u': wine_u_path},\n", - " artifact_path=os.getcwd(),\n", - " local=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
u
00.348315
10.382022
20.269663
\n", - "
" - ], - "text/plain": [ - " u\n", - "0 0.348315\n", - "1 0.382022\n", - "2 0.269663" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
t
00.331461
10.398876
20.269663
\n", - "
" - ], - "text/plain": [ - " t\n", - "0 0.331461\n", - "1 0.398876\n", - "2 0.269663" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "virtual_drift_run.artifact('class_u_pdf').show()\n", - "virtual_drift_run.artifact('class_t_pdf').show()" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 13:58:04.445000+00:000.013980.0015640.0168540.54119610.00.5some_stream
2021-10-26 14:00:44.008000+00:000.013980.0015640.0168540.54119610.00.5some_stream
\n", - "
" - ], - "text/plain": [ - " class_shift_helinger class_shift_kld \\\n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.01398 0.001564 \n", - "2021-10-26 14:00:44.008000+00:00 0.01398 0.001564 \n", - "\n", - " class_shift_tvd prior_helinger prior_kld \\\n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.016854 0.541196 10.0 \n", - "2021-10-26 14:00:44.008000+00:00 0.016854 0.541196 10.0 \n", - "\n", - " prior_tvd stream \n", - "time \n", - "2021-10-26 13:58:04.445000+00:00 0.5 some_stream \n", - "2021-10-26 14:00:44.008000+00:00 0.5 some_stream " - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import v3io_frames as v3f\n", - "client = v3f.Client(os.environ[\"V3IO_FRAMESD\"],container=container[1:])\n", - "client.read(backend='tsdb',table=tsdb_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Back to the top](#Virtual-Drift)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/functions/development/virtual_drift/latest/src/virtual_drift.py b/functions/development/virtual_drift/latest/src/virtual_drift.py deleted file mode 100644 index 71dcf712..00000000 --- a/functions/development/virtual_drift/latest/src/virtual_drift.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Generated by nuclio.export.NuclioExporter - -import os -import pandas as pd -import numpy as np -import scipy as sp -import pickle -import datetime - -import v3io_frames as v3f - -import matplotlib.pyplot as plt -from sklearn.preprocessing import KBinsDiscretizer - - -def to_observations(context, t, u, key): - t = ( - t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() - ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf - - -def tvd(t, u): - return sum(abs(t - u)) / 2 - - -def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2) - - -def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t - - -def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u) - - -def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", 
- results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. - Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, 
f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - 
results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df) diff --git a/functions/development/virtual_drift/latest/static/documentation.html b/functions/development/virtual_drift/latest/static/documentation.html deleted file mode 100644 index 417e5ac4..00000000 --- a/functions/development/virtual_drift/latest/static/documentation.html +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - -virtual_drift package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

virtual_drift package

- -
- -
-
-
-
-
-

virtual_drift package#

-
-

Submodules#

-
-
-

virtual_drift.virtual_drift module#

-
-
-virtual_drift.virtual_drift.all_metrics(t, u)[source]#
-
-
-
-virtual_drift.virtual_drift.drift_magnitude(context, t: pandas.core.frame.DataFrame, u: pandas.core.frame.DataFrame, label_col=None, prediction_col=None, discretizers: Optional[dict] = None, n_bins=5, stream_name: str = 'some_stream', results_tsdb_container: str = 'bigdata', results_tsdb_table: str = 'concept_drift/drift_magnitude')[source]#
-
-
Drift magnitude metrics

Computes drift magnitude metrics between base dataset t and dataset u. -Metrics:

-
-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-
-
-
-
Parameters
-
    -
  • context – MLRun context

  • -
  • t – Base dataset for the drift metrics

  • -
  • u – Test dataset for the drift metrics

  • -
  • label_col – Label colum in t and u

  • -
  • prediction_col – Predictions column in t and u

  • -
  • discritizers – Dictionary of dicsritizers for the features if available -(Created automatically if not provided)

  • -
  • n_bins – Number of bins to be used for histrogram creation from continuous variables

  • -
  • stream_name – Output stream to push metrics to

  • -
  • results_tsdb_container – TSDB table container to push metrics to

  • -
  • results_tsdb_table – TSDB table to push metrics to

  • -
-
-
-
-
-
-virtual_drift.virtual_drift.helinger(t, u)[source]#
-
-
-
-virtual_drift.virtual_drift.kl_divergence(t, u)[source]#
-
-
-
-virtual_drift.virtual_drift.to_observations(context, t, u, key)[source]#
-
-
-
-virtual_drift.virtual_drift.tvd(t, u)[source]#
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/virtual_drift/latest/static/example.html b/functions/development/virtual_drift/latest/static/example.html deleted file mode 100644 index aafbe33c..00000000 --- a/functions/development/virtual_drift/latest/static/example.html +++ /dev/null @@ -1,962 +0,0 @@ - - - - - - - -Virtual Drift - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

Virtual Drift

- -
- -
-
-
-
-
-

Virtual Drift#

-

Drift magnitude metrics -Computes drift magnitude metrics between base dataset t and dataset u.

-

Metrics:

-
    -
  • TVD (Total Variation Distance)

  • -
  • Helinger

  • -
  • KL Divergence

  • -
-
-

Steps#

-
    -
  1. Data exploration

  2. -
  3. Importing the function

  4. -
  5. Running the function locally

  6. -
-
-
-

Data exploration#

-
-
-
# Scikit-learn's wine dataset
-from sklearn.datasets import load_wine
-
-wine = load_wine()
-print(wine["DESCR"])
-
-
-
-
-
.. _wine_dataset:
-
-Wine recognition dataset
-------------------------
-
-**Data Set Characteristics:**
-
-    :Number of Instances: 178 (50 in each of three classes)
-    :Number of Attributes: 13 numeric, predictive attributes and the class
-    :Attribute Information:
- 		- Alcohol
- 		- Malic acid
- 		- Ash
-		- Alcalinity of ash  
- 		- Magnesium
-		- Total phenols
- 		- Flavanoids
- 		- Nonflavanoid phenols
- 		- Proanthocyanins
-		- Color intensity
- 		- Hue
- 		- OD280/OD315 of diluted wines
- 		- Proline
-
-    - class:
-            - class_0
-            - class_1
-            - class_2
-		
-    :Summary Statistics:
-    
-    ============================= ==== ===== ======= =====
-                                   Min   Max   Mean     SD
-    ============================= ==== ===== ======= =====
-    Alcohol:                      11.0  14.8    13.0   0.8
-    Malic Acid:                   0.74  5.80    2.34  1.12
-    Ash:                          1.36  3.23    2.36  0.27
-    Alcalinity of Ash:            10.6  30.0    19.5   3.3
-    Magnesium:                    70.0 162.0    99.7  14.3
-    Total Phenols:                0.98  3.88    2.29  0.63
-    Flavanoids:                   0.34  5.08    2.03  1.00
-    Nonflavanoid Phenols:         0.13  0.66    0.36  0.12
-    Proanthocyanins:              0.41  3.58    1.59  0.57
-    Colour Intensity:              1.3  13.0     5.1   2.3
-    Hue:                          0.48  1.71    0.96  0.23
-    OD280/OD315 of diluted wines: 1.27  4.00    2.61  0.71
-    Proline:                       278  1680     746   315
-    ============================= ==== ===== ======= =====
-
-    :Missing Attribute Values: None
-    :Class Distribution: class_0 (59), class_1 (71), class_2 (48)
-    :Creator: R.A. Fisher
-    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
-    :Date: July, 1988
-
-This is a copy of UCI ML Wine recognition datasets.
-https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
-
-The data is the results of a chemical analysis of wines grown in the same
-region in Italy by three different cultivators. There are thirteen different
-measurements taken for different constituents found in the three types of
-wine.
-
-Original Owners: 
-
-Forina, M. et al, PARVUS - 
-An Extendible Package for Data Exploration, Classification and Correlation. 
-Institute of Pharmaceutical and Food Analysis and Technologies,
-Via Brigata Salerno, 16147 Genoa, Italy.
-
-Citation:
-
-Lichman, M. (2013). UCI Machine Learning Repository
-[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
-School of Information and Computer Science. 
-
-.. topic:: References
-
-  (1) S. Aeberhard, D. Coomans and O. de Vel, 
-  Comparison of Classifiers in High Dimensional Settings, 
-  Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of  
-  Mathematics and Statistics, James Cook University of North Queensland. 
-  (Also submitted to Technometrics). 
-
-  The data was used with many others for comparing various 
-  classifiers. The classes are separable, though only RDA 
-  has achieved 100% correct classification. 
-  (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) 
-  (All results using the leave-one-out technique) 
-
-  (2) S. Aeberhard, D. Coomans and O. de Vel, 
-  "THE CLASSIFICATION PERFORMANCE OF RDA" 
-  Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of 
-  Mathematics and Statistics, James Cook University of North Queensland. 
-  (Also submitted to Journal of Chemometrics).
-
-
-
-
-
-
-
wine_t_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_t.pq'
-wine_u_path = 'https://s3.wasabisys.com/iguazio/data/function-marketplace-data/virtual_drift/wine_u.pq'
-wine_t=pd.read_parquet(wine_t_path)
-wine_u=pd.read_parquet(wine_u_path)
-print(f'wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. \n\
-wine_t shape is {wine_t.shape[0]} and wine_u shape is {wine_u.shape[0]} \n\n')
-wine_t.head()
-
-
-
-
-
wine_t and wine_u are generated from the wine dataset, where wine_t is the entire dataset while wine_u is a sample (50%) of the entire dataset. 
-wine_t shape is 178 and wine_u shape is 89 
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineyprediction
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.000
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.000
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.000
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.000
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.000
-
-
-
-
-

Importing the function#

-
-
-
import mlrun
-
-# Importing the function
-mlrun.set_environment(project='function-marketplace')
-
-fn = mlrun.import_function("hub://virtual_drift")
-fn.apply(mlrun.auto_mount())
-
-
-
-
-
> 2021-10-26 13:45:22,345 [info] created and saved project function-marketplace
-
-
-
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff54a864dd0>
-
-
-
-
-
-
-

Running the function locally#

-
-
-
import os 
-
-container = os.path.join('/',os.environ['V3IO_HOME'].split('/')[0])
-user = os.environ["V3IO_USERNAME"]
-rel_path = os.getcwd()[6:] + '/artifacts'
-tsdb_path = os.path.join(user,rel_path) + "/output_tsdb"
-
-
-
-
-
-
-
virtual_drift_run=fn.run(params={'label_col': 'y',
-                                 'results_tsdb_container': container[1:],
-                                 'results_tsdb_table': tsdb_path},
-                         inputs={'t': wine_t_path,
-                                 'u': wine_u_path},
-                         artifact_path=os.getcwd(),
-                         local=True)
-
-
-
-
-
> 2021-10-26 14:00:41,020 [info] starting run virtual-drift-drift_magnitude uid=28ec7f08ce7c4c528114e2590ff49325 DB=http://mlrun-api:8080
-
-
-
Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.
-
-
-
> 2021-10-26 14:00:43,469 [info] Fitting discretizer for alcohol
-> 2021-10-26 14:00:43,471 [info] Fitting discretizer for malic_acid
-> 2021-10-26 14:00:43,471 [info] Fitting discretizer for ash
-> 2021-10-26 14:00:43,472 [info] Fitting discretizer for alcalinity_of_ash
-> 2021-10-26 14:00:43,473 [info] Fitting discretizer for magnesium
-> 2021-10-26 14:00:43,474 [info] Fitting discretizer for total_phenols
-> 2021-10-26 14:00:43,475 [info] Fitting discretizer for flavanoids
-> 2021-10-26 14:00:43,476 [info] Fitting discretizer for nonflavanoid_phenols
-> 2021-10-26 14:00:43,477 [info] Fitting discretizer for proanthocyanins
-> 2021-10-26 14:00:43,477 [info] Fitting discretizer for color_intensity
-> 2021-10-26 14:00:43,478 [info] Fitting discretizer for hue
-> 2021-10-26 14:00:43,479 [info] Fitting discretizer for od280/od315_of_diluted_wines
-> 2021-10-26 14:00:43,480 [info] Fitting discretizer for proline
-> 2021-10-26 14:00:43,531 [info] Discretizing featuers
-> 2021-10-26 14:00:43,752 [info] Compute prior metrics
-> 2021-10-26 14:00:43,889 [info] Compute class metrics
-> 2021-10-26 14:00:44,000 [info] value: inf
-> 2021-10-26 14:00:44,009 [info] Timestamp: 2021-10-26 14:00:44.008992
-
-
-
divide by zero encountered in log
-casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
function-marketplace0Oct 26 14:00:41completedvirtual-drift-drift_magnitude
v3io_user=dani
kind=
owner=dani
host=jupyter-dani-6bfbd76d96-zxx6f
t
u
label_col=y
results_tsdb_container=users
results_tsdb_table=dani/test/functions/virtual_drift/artifacts/output_tsdb
prior_tvd=0.5
prior_helinger=0.541
prior_kld=10
class_shift_tvd=0.017
class_shift_helinger=0.014
class_shift_kld=0.002
discritizers
t_discrete
u_discrete
features_t_pdf
features_u_pdf
class_t_pdf
class_u_pdf
-
- -
-

-
-
-
> to track results use the .show() or .logs() methods or click here to open in UI
> 2021-10-26 14:00:44,153 [info] run executed, status=completed
-
-
-
-
-
-
-
virtual_drift_run.artifact('class_u_pdf').show()
-virtual_drift_run.artifact('class_t_pdf').show()
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - -
u
00.348315
10.382022
20.269663
-
- - - - - - - - - - - - - - - - - - - - - - -
t
00.331461
10.398876
20.269663
-
-
-
-
-
import v3io_frames as v3f
-client = v3f.Client(os.environ["V3IO_FRAMESD"],container=container[1:])
-client.read(backend='tsdb',table=tsdb_path)
-
-
-
-
-
Warning - Server version '0.8.14' is different from client version '0.9.4'. Some operations may not work as expected.
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class_shift_helingerclass_shift_kldclass_shift_tvdprior_helingerprior_kldprior_tvdstream
time
2021-10-26 13:58:04.445000+00:000.013980.0015640.0168540.54119610.00.5some_stream
2021-10-26 14:00:44.008000+00:000.013980.0015640.0168540.54119610.00.5some_stream
-
-
-

Back to the top

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/virtual_drift/latest/static/function.html b/functions/development/virtual_drift/latest/static/function.html deleted file mode 100644 index 989d4fe0..00000000 --- a/functions/development/virtual_drift/latest/static/function.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: job
-metadata:
-  name: virtual-drift
-  tag: ''
-  hash: 8990fdd72fc550189a0c8b488b69997428b786c9
-  project: ''
-  labels:
-    author: orz
-  categories:
-  - data-analysis
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  env: []
-  default_handler: drift_magnitude
-  entry_points:
-    to_observations:
-      name: to_observations
-      doc: ''
-      parameters:
-      - name: context
-        default: ''
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      - name: key
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 16
-    tvd:
-      name: tvd
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 42
-    helinger:
-      name: helinger
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 46
-    kl_divergence:
-      name: kl_divergence
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 50
-    all_metrics:
-      name: all_metrics
-      doc: ''
-      parameters:
-      - name: t
-        default: ''
-      - name: u
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 56
-    drift_magnitude:
-      name: drift_magnitude
-      doc: "Drift magnitude metrics\n   Computes drift magnitude metrics between base\
-        \ dataset t and dataset u.\n   Metrics:\n    - TVD (Total Variation Distance)\n\
-        \    - Helinger\n    - KL Divergence"
-      parameters:
-      - name: context
-        doc: MLRun context
-        default: ''
-      - name: t
-        type: DataFrame
-        doc: Base dataset for the drift metrics
-        default: ''
-      - name: u
-        type: DataFrame
-        doc: Test dataset for the drift metrics
-        default: ''
-      - name: label_col
-        doc: Label colum in t and u
-        default: null
-      - name: prediction_col
-        doc: Predictions column in t and u
-        default: null
-      - name: discretizers
-        type: dict
-        default: null
-      - name: n_bins
-        doc: Number of bins to be used for histrogram creation from continuous variables
-        default: 5
-      - name: stream_name
-        type: str
-        doc: Output stream to push metrics to
-        default: some_stream
-      - name: results_tsdb_container
-        type: str
-        doc: TSDB table container to push metrics to
-        default: bigdata
-      - name: results_tsdb_table
-        type: str
-        doc: TSDB table to push metrics to
-        default: concept_drift/drift_magnitude
-      outputs:
-      - default: ''
-      lineno: 60
-  description: Compute drift magnitude between Time-Samples T and U
-  build:
-    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBzY2lweSBhcyBzcAppbXBvcnQgcGlja2xlCmltcG9ydCBkYXRldGltZQoKaW1wb3J0IHYzaW9fZnJhbWVzIGFzIHYzZgoKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdApmcm9tIHNrbGVhcm4ucHJlcHJvY2Vzc2luZyBpbXBvcnQgS0JpbnNEaXNjcmV0aXplcgoKCmRlZiB0b19vYnNlcnZhdGlvbnMoY29udGV4dCwgdCwgdSwga2V5KToKICAgIHQgPSAoCiAgICAgICAgdC5hcHBseShsYW1iZGEgcm93OiBmInsnXycuam9pbihbc3RyKHJvd1t2YWxdKSBmb3IgdmFsIGluIHQuY29sdW1uc10pfSIsIGF4aXM9MSkKICAgICAgICAudmFsdWVfY291bnRzKCkKICAgICAgICAuc29ydF9pbmRleCgpCiAgICApCiAgICB1ID0gKAogICAgICAgIHUuYXBwbHkobGFtYmRhIHJvdzogZiJ7J18nLmpvaW4oW3N0cihyb3dbdmFsXSkgZm9yIHZhbCBpbiB1LmNvbHVtbnNdKX0iLCBheGlzPTEpCiAgICAgICAgLnZhbHVlX2NvdW50cygpCiAgICAgICAgLnNvcnRfaW5kZXgoKQogICAgKQoKICAgIGpvaW5lZF91bmlxdWVzID0gcGQuRGF0YUZyYW1lKFt0LCB1XSkuVC5maWxsbmEoMCkuc29ydF9pbmRleCgpCiAgICBqb2luZWRfdW5pcXVlcy5jb2x1bW5zID0gWyJ0IiwgInUiXQoKICAgIHRfb2JzID0gam9pbmVkX3VuaXF1ZXMubG9jWzosICJ0Il0KICAgIHVfb2JzID0gam9pbmVkX3VuaXF1ZXMubG9jWzosICJ1Il0KCiAgICB0X3BkZiA9IHRfb2JzIC8gdF9vYnMuc3VtKCkKICAgIHVfcGRmID0gdV9vYnMgLyB1X29icy5zdW0oKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoZiJ7a2V5fV90X3BkZiIsIHBkLkRhdGFGcmFtZSh0X3BkZiksIGZvcm1hdD0icGFycXVldCIpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KGYie2tleX1fdV9wZGYiLCBwZC5EYXRhRnJhbWUodV9wZGYpLCBmb3JtYXQ9InBhcnF1ZXQiKQogICAgcmV0dXJuIHRfcGRmLCB1X3BkZgoKCmRlZiB0dmQodCwgdSk6CiAgICByZXR1cm4gc3VtKGFicyh0IC0gdSkpIC8gMgoKCmRlZiBoZWxpbmdlcih0LCB1KToKICAgIHJldHVybiAobnAuc3FydChucC5zdW0obnAucG93ZXIobnAuc3FydCh0KSAtIG5wLnNxcnQodSksIDIpKSkpIC8gbnAuc3FydCgyKQoKCmRlZiBrbF9kaXZlcmdlbmNlKHQsIHUpOgogICAgdF91ID0gbnAuc3VtKG5wLndoZXJlKHQgIT0gMCwgdCAqIG5wLmxvZyh0IC8gdSksIDApKQogICAgdV90ID0gbnAuc3VtKG5wLndoZXJlKHUgIT0gMCwgdSAqIG5wLmxvZyh1IC8gdCksIDApKQogICAgcmV0dXJuIHRfdSArIHVfdAoKCmRlZiBhbGxfbWV0cmljcyh0LCB1KToKICAgIHJldHVybiB0dmQodCwgdSksIGhlbGluZ2VyKHQsIHUpLCBrbF9kaXZlcmdlbmNlKHQsIHUpCgoKZGVmIGRyaWZ0X21hZ25pdHVkZSgKICAgIGNvbnRleHQsCiAgICB0OiBwZC5EYXRhRnJ
hbWUsCiAgICB1OiBwZC5EYXRhRnJhbWUsCiAgICBsYWJlbF9jb2w9Tm9uZSwKICAgIHByZWRpY3Rpb25fY29sPU5vbmUsCiAgICBkaXNjcmV0aXplcnM6IGRpY3QgPSBOb25lLAogICAgbl9iaW5zPTUsCiAgICBzdHJlYW1fbmFtZTogc3RyID0gInNvbWVfc3RyZWFtIiwKICAgIHJlc3VsdHNfdHNkYl9jb250YWluZXI6IHN0ciA9ICJiaWdkYXRhIiwKICAgIHJlc3VsdHNfdHNkYl90YWJsZTogc3RyID0gImNvbmNlcHRfZHJpZnQvZHJpZnRfbWFnbml0dWRlIiwKKToKICAgICIiIkRyaWZ0IG1hZ25pdHVkZSBtZXRyaWNzCiAgICAgICBDb21wdXRlcyBkcmlmdCBtYWduaXR1ZGUgbWV0cmljcyBiZXR3ZWVuIGJhc2UgZGF0YXNldCB0IGFuZCBkYXRhc2V0IHUuCiAgICAgICBNZXRyaWNzOgogICAgICAgIC0gVFZEIChUb3RhbCBWYXJpYXRpb24gRGlzdGFuY2UpCiAgICAgICAgLSBIZWxpbmdlcgogICAgICAgIC0gS0wgRGl2ZXJnZW5jZQoKICAgIDpwYXJhbSBjb250ZXh0OiBNTFJ1biBjb250ZXh0CiAgICA6cGFyYW0gdDogQmFzZSBkYXRhc2V0IGZvciB0aGUgZHJpZnQgbWV0cmljcwogICAgOnBhcmFtIHU6IFRlc3QgZGF0YXNldCBmb3IgdGhlIGRyaWZ0IG1ldHJpY3MKICAgIDpwYXJhbSBsYWJlbF9jb2w6IExhYmVsIGNvbHVtIGluIHQgYW5kIHUKICAgIDpwYXJhbSBwcmVkaWN0aW9uX2NvbDogUHJlZGljdGlvbnMgY29sdW1uIGluIHQgYW5kIHUKICAgIDpwYXJhbSBkaXNjcml0aXplcnM6IERpY3Rpb25hcnkgb2YgZGljc3JpdGl6ZXJzIGZvciB0aGUgZmVhdHVyZXMgaWYgYXZhaWxhYmxlCiAgICAgICAgICAgICAgICAgICAgICAgICAoQ3JlYXRlZCBhdXRvbWF0aWNhbGx5IGlmIG5vdCBwcm92aWRlZCkKICAgIDpwYXJhbSBuX2JpbnM6IE51bWJlciBvZiBiaW5zIHRvIGJlIHVzZWQgZm9yIGhpc3Ryb2dyYW0gY3JlYXRpb24gZnJvbSBjb250aW51b3VzIHZhcmlhYmxlcwogICAgOnBhcmFtIHN0cmVhbV9uYW1lOiBPdXRwdXQgc3RyZWFtIHRvIHB1c2ggbWV0cmljcyB0bwogICAgOnBhcmFtIHJlc3VsdHNfdHNkYl9jb250YWluZXI6IFRTREIgdGFibGUgY29udGFpbmVyIHRvIHB1c2ggbWV0cmljcyB0bwogICAgOnBhcmFtIHJlc3VsdHNfdHNkYl90YWJsZTogVFNEQiB0YWJsZSB0byBwdXNoIG1ldHJpY3MgdG8KICAgICIiIgoKICAgIHYzaW9fY2xpZW50ID0gdjNmLkNsaWVudCgiZnJhbWVzZDo4MDgxIiwgY29udGFpbmVyPXJlc3VsdHNfdHNkYl9jb250YWluZXIpCiAgICB0cnk6CiAgICAgICAgdjNpb19jbGllbnQuY3JlYXRlKCJ0c2RiIiwgcmVzdWx0c190c2RiX3RhYmxlLCBpZl9leGlzdHM9MSwgcmF0ZT0iMS9zIikKICAgIGV4Y2VwdDoKICAgICAgICB2M2lvX2NsaWVudC5jcmVhdGUoCiAgICAgICAgICAgICJ0c2RiIiwgcmVzdWx0c190c2RiX3RhYmxlLCBpZl9leGlzdHM9MSwgYXR0cnM9eyJyYXRlIjogIjEvcyJ9CiAgICAgICAgKQoKICAgIGRmX3QgPSB0LmFzX2RmKCkKICAgIGRmX3UgPSB1LmFzX2RmKCkKCiAgICBkcm9wX2N
vbHVtbnMgPSBbXQogICAgaWYgbGFiZWxfY29sIGlzIG5vdCBOb25lOgogICAgICAgIGRyb3BfY29sdW1ucy5hcHBlbmQobGFiZWxfY29sKQogICAgaWYgcHJlZGljdGlvbl9jb2wgaXMgbm90IE5vbmU6CiAgICAgICAgZHJvcF9jb2x1bW5zLmFwcGVuZChwcmVkaWN0aW9uX2NvbCkKCiAgICBjb250aW51b3VzX2ZlYXR1cmVzID0gZGZfdC5zZWxlY3RfZHR5cGVzKFsiZmxvYXQiXSkKICAgIGlmIGRpc2NyZXRpemVycyBpcyBOb25lOgogICAgICAgIGRpc2NyZXRpemVycyA9IHt9CiAgICAgICAgZm9yIGZlYXR1cmUgaW4gY29udGludW91c19mZWF0dXJlcy5jb2x1bW5zOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiRml0dGluZyBkaXNjcmV0aXplciBmb3Ige2ZlYXR1cmV9IikKICAgICAgICAgICAgZGlzY3JldGl6ZXIgPSBLQmluc0Rpc2NyZXRpemVyKAogICAgICAgICAgICAgICAgbl9iaW5zPW5fYmlucywgZW5jb2RlPSJvcmRpbmFsIiwgc3RyYXRlZ3k9InVuaWZvcm0iCiAgICAgICAgICAgICkKCiAgICAgICAgICAgIGRpc2NyZXRpemVyLmZpdChjb250aW51b3VzX2ZlYXR1cmVzLmxvY1s6LCBmZWF0dXJlXS52YWx1ZXMucmVzaGFwZSgtMSwgMSkpCiAgICAgICAgICAgIGRpc2NyZXRpemVyc1tmZWF0dXJlXSA9IGRpc2NyZXRpemVyCiAgICBvcy5tYWtlZGlycyhjb250ZXh0LmFydGlmYWN0X3BhdGgsIGV4aXN0X29rPVRydWUpCiAgICBkaXNjcmV0aXplcnNfcGF0aCA9IG9zLnBhdGguYWJzcGF0aChmIntjb250ZXh0LmFydGlmYWN0X3BhdGh9L2Rpc2NyaXRpemVyLnBrbCIpCiAgICB3aXRoIG9wZW4oZGlzY3JldGl6ZXJzX3BhdGgsICJ3YiIpIGFzIGY6CiAgICAgICAgcGlja2xlLmR1bXAoZGlzY3JldGl6ZXJzLCBmKQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoImRpc2NyaXRpemVycyIsIHRhcmdldF9wYXRoPWRpc2NyZXRpemVyc19wYXRoKQogICAgY29udGV4dC5sb2dnZXIuaW5mbygiRGlzY3JldGl6aW5nIGZlYXR1ZXJzIikKICAgIGZvciBmZWF0dXJlLCBkaXNjcmV0aXplciBpbiBkaXNjcmV0aXplcnMuaXRlbXMoKToKICAgICAgICBkZl90W2ZlYXR1cmVdID0gZGlzY3JldGl6ZXIudHJhbnNmb3JtKAogICAgICAgICAgICBkZl90LmxvY1s6LCBmZWF0dXJlXS52YWx1ZXMucmVzaGFwZSgtMSwgMSkKICAgICAgICApCiAgICAgICAgZGZfdVtmZWF0dXJlXSA9IGRpc2NyZXRpemVyLnRyYW5zZm9ybSgKICAgICAgICAgICAgZGZfdS5sb2NbOiwgZmVhdHVyZV0udmFsdWVzLnJlc2hhcGUoLTEsIDEpCiAgICAgICAgKQogICAgICAgIGRmX3RbZmVhdHVyZV0gPSBkZl90W2ZlYXR1cmVdLmFzdHlwZSgiaW50IikKICAgICAgICBkZl91W2ZlYXR1cmVdID0gZGZfdVtmZWF0dXJlXS5hc3R5cGUoImludCIpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCJ0X2Rpc2NyZXRlIiwgZGZfdCwgZm9ybWF0PSJwYXJxdWV0IikKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoInVfZGlzY3JldGUiLCBkZl91LCBmb3JtYXQ9InBhcnF1ZXQ
iKQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNvbXB1dGUgcHJpb3IgbWV0cmljcyIpCgogICAgcmVzdWx0cyA9IHt9CiAgICB0X3ByaW9yLCB1X3ByaW9yID0gdG9fb2JzZXJ2YXRpb25zKAogICAgICAgIGNvbnRleHQsCiAgICAgICAgZGZfdC5kcm9wKGRyb3BfY29sdW1ucywgYXhpcz0xKSwKICAgICAgICBkZl91LmRyb3AoZHJvcF9jb2x1bW5zLCBheGlzPTEpLAogICAgICAgICJmZWF0dXJlcyIsCiAgICApCiAgICByZXN1bHRzWyJwcmlvcl90dmQiXSwgcmVzdWx0c1sicHJpb3JfaGVsaW5nZXIiXSwgcmVzdWx0c1sicHJpb3Jfa2xkIl0gPSBhbGxfbWV0cmljcygKICAgICAgICB0X3ByaW9yLCB1X3ByaW9yCiAgICApCgogICAgaWYgcHJlZGljdGlvbl9jb2wgaXMgbm90IE5vbmU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ29tcHV0ZSBwcmVkaWN0aW9uIG1ldHJpY3MiKQogICAgICAgIHRfcHJlZGljdGlvbnMgPSBwZC5EYXRhRnJhbWUoZGZfdC5sb2NbOiwgcHJlZGljdGlvbl9jb2xdKQogICAgICAgIHVfcHJlZGljdGlvbnMgPSBwZC5EYXRhRnJhbWUoZGZfdS5sb2NbOiwgcHJlZGljdGlvbl9jb2xdKQogICAgICAgIHRfY2xhc3MsIHVfY2xhc3MgPSB0b19vYnNlcnZhdGlvbnMoCiAgICAgICAgICAgIGNvbnRleHQsIHRfcHJlZGljdGlvbnMsIHVfcHJlZGljdGlvbnMsICJwcmVkaWN0aW9uIgogICAgICAgICkKICAgICAgICAoCiAgICAgICAgICAgIHJlc3VsdHNbInByZWRpY3Rpb25fc2hpZnRfdHZkIl0sCiAgICAgICAgICAgIHJlc3VsdHNbInByZWRpY3Rpb25fc2hpZnRfaGVsaW5nZXIiXSwKICAgICAgICAgICAgcmVzdWx0c1sicHJlZGljdGlvbl9zaGlmdF9rbGQiXSwKICAgICAgICApID0gYWxsX21ldHJpY3ModF9jbGFzcywgdV9jbGFzcykKCiAgICBpZiBsYWJlbF9jb2wgaXMgbm90IE5vbmU6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiQ29tcHV0ZSBjbGFzcyBtZXRyaWNzIikKICAgICAgICB0X2xhYmVscyA9IHBkLkRhdGFGcmFtZShkZl90LmxvY1s6LCBsYWJlbF9jb2xdKQogICAgICAgIHVfbGFiZWxzID0gcGQuRGF0YUZyYW1lKGRmX3UubG9jWzosIGxhYmVsX2NvbF0pCiAgICAgICAgdF9jbGFzcywgdV9jbGFzcyA9IHRvX29ic2VydmF0aW9ucyhjb250ZXh0LCB0X2xhYmVscywgdV9sYWJlbHMsICJjbGFzcyIpCiAgICAgICAgKAogICAgICAgICAgICByZXN1bHRzWyJjbGFzc19zaGlmdF90dmQiXSwKICAgICAgICAgICAgcmVzdWx0c1siY2xhc3Nfc2hpZnRfaGVsaW5nZXIiXSwKICAgICAgICAgICAgcmVzdWx0c1siY2xhc3Nfc2hpZnRfa2xkIl0sCiAgICAgICAgKSA9IGFsbF9tZXRyaWNzKHRfY2xhc3MsIHVfY2xhc3MpCgogICAgZm9yIGtleSwgdmFsdWUgaW4gcmVzdWx0cy5pdGVtcygpOgogICAgICAgIGlmIHZhbHVlID09IGZsb2F0KCJpbmYiKToKICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbyhmInZhbHVlOiB7dmFsdWV9IikKICAgICAgICAgICAgcmVzdWx0c1trZXldID0gMTA
KICAgIGZvciBrZXksIHJlc3VsdCBpbiByZXN1bHRzLml0ZW1zKCk6CiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KGtleSwgcm91bmQocmVzdWx0LCAzKSkKCiAgICBub3cgPSBwZC50b19kYXRldGltZShzdHIoZGF0ZXRpbWUuZGF0ZXRpbWUubm93KCkpKQogICAgbm93CgogICAgcmVzdWx0c1sidGltZXN0YW1wIl0gPSBwZC50b19kYXRldGltZShzdHIoKGRhdGV0aW1lLmRhdGV0aW1lLm5vdygpKSkpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiVGltZXN0YW1wOiB7cmVzdWx0c1sndGltZXN0YW1wJ119IikKICAgIHJlc3VsdHNbInN0cmVhbSJdID0gc3RyZWFtX25hbWUKICAgIHJlc3VsdHNfZGYgPSBwZC5EYXRhRnJhbWUoCiAgICAgICAgZGF0YT1bbGlzdChyZXN1bHRzLnZhbHVlcygpKV0sIGNvbHVtbnM9bGlzdChyZXN1bHRzLmtleXMoKSkKICAgICkKICAgIHJlc3VsdHNfZGYgPSByZXN1bHRzX2RmLnNldF9pbmRleChbInRpbWVzdGFtcCIsICJzdHJlYW0iXSkKICAgIHYzaW9fY2xpZW50LndyaXRlKCJ0c2RiIiwgcmVzdWx0c190c2RiX3RhYmxlLCBkZnM9cmVzdWx0c19kZikK
-    commands:
-    - python -m pip install scikit-learn scipy v3io_frames
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/virtual_drift/virtual_drift.py
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/latest/static/item.html b/functions/development/virtual_drift/latest/static/item.html deleted file mode 100644 index 14d2ee3f..00000000 --- a/functions/development/virtual_drift/latest/static/item.html +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- data-analysis
-- machine-learning
-description: Compute drift magnitude between Time-Samples T and U
-doc: ''
-example: virtual_drift.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: orz
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: virtual-drift
-platformVersion: 3.5.0
-spec:
-  filename: virtual_drift.py
-  handler: drift_magnitude
-  image: mlrun/ml-models
-  kind: job
-  requirements:
-  - scikit-learn
-  - scipy
-  - v3io_frames
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/latest/static/source.html b/functions/development/virtual_drift/latest/static/source.html deleted file mode 100644 index 87b05c07..00000000 --- a/functions/development/virtual_drift/latest/static/source.html +++ /dev/null @@ -1,228 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-def to_observations(context, t, u, key):
-    t = (
-        t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-    u = (
-        u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1)
-        .value_counts()
-        .sort_index()
-    )
-
-    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
-    joined_uniques.columns = ["t", "u"]
-
-    t_obs = joined_uniques.loc[:, "t"]
-    u_obs = joined_uniques.loc[:, "u"]
-
-    t_pdf = t_obs / t_obs.sum()
-    u_pdf = u_obs / u_obs.sum()
-
-    context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet")
-    context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet")
-    return t_pdf, u_pdf
-
-
-def tvd(t, u):
-    return sum(abs(t - u)) / 2
-
-
-def helinger(t, u):
-    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
-
-
-def kl_divergence(t, u):
-    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
-    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
-    return t_u + u_t
-
-
-def all_metrics(t, u):
-    return tvd(t, u), helinger(t, u), kl_divergence(t, u)
-
-
-def drift_magnitude(
-    context,
-    t: pd.DataFrame,
-    u: pd.DataFrame,
-    label_col=None,
-    prediction_col=None,
-    discretizers: dict = None,
-    n_bins=5,
-    stream_name: str = "some_stream",
-    results_tsdb_container: str = "bigdata",
-    results_tsdb_table: str = "concept_drift/drift_magnitude",
-):
-    """Drift magnitude metrics
-       Computes drift magnitude metrics between base dataset t and dataset u.
-       Metrics:
-        - TVD (Total Variation Distance)
-        - Helinger
-        - KL Divergence
-
-    :param context: MLRun context
-    :param t: Base dataset for the drift metrics
-    :param u: Test dataset for the drift metrics
-    :param label_col: Label colum in t and u
-    :param prediction_col: Predictions column in t and u
-    :param discritizers: Dictionary of dicsritizers for the features if available
-                         (Created automatically if not provided)
-    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
-    :param stream_name: Output stream to push metrics to
-    :param results_tsdb_container: TSDB table container to push metrics to
-    :param results_tsdb_table: TSDB table to push metrics to
-    """
-
-    v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container)
-    try:
-        v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s")
-    except:
-        v3io_client.create(
-            "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"}
-        )
-
-    df_t = t.as_df()
-    df_u = u.as_df()
-
-    drop_columns = []
-    if label_col is not None:
-        drop_columns.append(label_col)
-    if prediction_col is not None:
-        drop_columns.append(prediction_col)
-
-    continuous_features = df_t.select_dtypes(["float"])
-    if discretizers is None:
-        discretizers = {}
-        for feature in continuous_features.columns:
-            context.logger.info(f"Fitting discretizer for {feature}")
-            discretizer = KBinsDiscretizer(
-                n_bins=n_bins, encode="ordinal", strategy="uniform"
-            )
-
-            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
-            discretizers[feature] = discretizer
-    os.makedirs(context.artifact_path, exist_ok=True)
-    discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl")
-    with open(discretizers_path, "wb") as f:
-        pickle.dump(discretizers, f)
-    context.log_artifact("discritizers", target_path=discretizers_path)
-    context.logger.info("Discretizing featuers")
-    for feature, discretizer in discretizers.items():
-        df_t[feature] = discretizer.transform(
-            df_t.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_u[feature] = discretizer.transform(
-            df_u.loc[:, feature].values.reshape(-1, 1)
-        )
-        df_t[feature] = df_t[feature].astype("int")
-        df_u[feature] = df_u[feature].astype("int")
-    context.log_dataset("t_discrete", df_t, format="parquet")
-    context.log_dataset("u_discrete", df_u, format="parquet")
-
-    context.logger.info("Compute prior metrics")
-
-    results = {}
-    t_prior, u_prior = to_observations(
-        context,
-        df_t.drop(drop_columns, axis=1),
-        df_u.drop(drop_columns, axis=1),
-        "features",
-    )
-    results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics(
-        t_prior, u_prior
-    )
-
-    if prediction_col is not None:
-        context.logger.info("Compute prediction metrics")
-        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
-        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
-        t_class, u_class = to_observations(
-            context, t_predictions, u_predictions, "prediction"
-        )
-        (
-            results["prediction_shift_tvd"],
-            results["prediction_shift_helinger"],
-            results["prediction_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    if label_col is not None:
-        context.logger.info("Compute class metrics")
-        t_labels = pd.DataFrame(df_t.loc[:, label_col])
-        u_labels = pd.DataFrame(df_u.loc[:, label_col])
-        t_class, u_class = to_observations(context, t_labels, u_labels, "class")
-        (
-            results["class_shift_tvd"],
-            results["class_shift_helinger"],
-            results["class_shift_kld"],
-        ) = all_metrics(t_class, u_class)
-
-    for key, value in results.items():
-        if value == float("inf"):
-            context.logger.info(f"value: {value}")
-            results[key] = 10
-    for key, result in results.items():
-        context.log_result(key, round(result, 3))
-
-    now = pd.to_datetime(str(datetime.datetime.now()))
-    now
-
-    results["timestamp"] = pd.to_datetime(str((datetime.datetime.now())))
-    context.logger.info(f"Timestamp: {results['timestamp']}")
-    results["stream"] = stream_name
-    results_df = pd.DataFrame(
-        data=[list(results.values())], columns=list(results.keys())
-    )
-    results_df = results_df.set_index(["timestamp", "stream"])
-    v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/virtual_drift/latest/static/virtual_drift.html b/functions/development/virtual_drift/latest/static/virtual_drift.html deleted file mode 100644 index 0775caf1..00000000 --- a/functions/development/virtual_drift/latest/static/virtual_drift.html +++ /dev/null @@ -1,346 +0,0 @@ - - - - - - - -virtual_drift.virtual_drift - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for virtual_drift.virtual_drift

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Generated by nuclio.export.NuclioExporter
-
-import os
-import pandas as pd
-import numpy as np
-import scipy as sp
-import pickle
-import datetime
-
-import v3io_frames as v3f
-
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import KBinsDiscretizer
-
-
-
[docs]def to_observations(context, t, u, key): - t = ( - t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1) - .value_counts() - .sort_index() - ) - u = ( - u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1) - .value_counts() - .sort_index() - ) - - joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index() - joined_uniques.columns = ["t", "u"] - - t_obs = joined_uniques.loc[:, "t"] - u_obs = joined_uniques.loc[:, "u"] - - t_pdf = t_obs / t_obs.sum() - u_pdf = u_obs / u_obs.sum() - - context.log_dataset(f"{key}_t_pdf", pd.DataFrame(t_pdf), format="parquet") - context.log_dataset(f"{key}_u_pdf", pd.DataFrame(u_pdf), format="parquet") - return t_pdf, u_pdf
- - -
[docs]def tvd(t, u): - return sum(abs(t - u)) / 2
- - -
[docs]def helinger(t, u): - return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2)))) / np.sqrt(2)
- - -
[docs]def kl_divergence(t, u): - t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0)) - u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0)) - return t_u + u_t
- - -
[docs]def all_metrics(t, u): - return tvd(t, u), helinger(t, u), kl_divergence(t, u)
- - -
[docs]def drift_magnitude( - context, - t: pd.DataFrame, - u: pd.DataFrame, - label_col=None, - prediction_col=None, - discretizers: dict = None, - n_bins=5, - stream_name: str = "some_stream", - results_tsdb_container: str = "bigdata", - results_tsdb_table: str = "concept_drift/drift_magnitude", -): - """Drift magnitude metrics - Computes drift magnitude metrics between base dataset t and dataset u. - Metrics: - - TVD (Total Variation Distance) - - Helinger - - KL Divergence - - :param context: MLRun context - :param t: Base dataset for the drift metrics - :param u: Test dataset for the drift metrics - :param label_col: Label colum in t and u - :param prediction_col: Predictions column in t and u - :param discritizers: Dictionary of dicsritizers for the features if available - (Created automatically if not provided) - :param n_bins: Number of bins to be used for histrogram creation from continuous variables - :param stream_name: Output stream to push metrics to - :param results_tsdb_container: TSDB table container to push metrics to - :param results_tsdb_table: TSDB table to push metrics to - """ - - v3io_client = v3f.Client("framesd:8081", container=results_tsdb_container) - try: - v3io_client.create("tsdb", results_tsdb_table, if_exists=1, rate="1/s") - except: - v3io_client.create( - "tsdb", results_tsdb_table, if_exists=1, attrs={"rate": "1/s"} - ) - - df_t = t.as_df() - df_u = u.as_df() - - drop_columns = [] - if label_col is not None: - drop_columns.append(label_col) - if prediction_col is not None: - drop_columns.append(prediction_col) - - continuous_features = df_t.select_dtypes(["float"]) - if discretizers is None: - discretizers = {} - for feature in continuous_features.columns: - context.logger.info(f"Fitting discretizer for {feature}") - discretizer = KBinsDiscretizer( - n_bins=n_bins, encode="ordinal", strategy="uniform" - ) - - discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1)) - discretizers[feature] = discretizer - 
os.makedirs(context.artifact_path, exist_ok=True) - discretizers_path = os.path.abspath(f"{context.artifact_path}/discritizer.pkl") - with open(discretizers_path, "wb") as f: - pickle.dump(discretizers, f) - context.log_artifact("discritizers", target_path=discretizers_path) - context.logger.info("Discretizing featuers") - for feature, discretizer in discretizers.items(): - df_t[feature] = discretizer.transform( - df_t.loc[:, feature].values.reshape(-1, 1) - ) - df_u[feature] = discretizer.transform( - df_u.loc[:, feature].values.reshape(-1, 1) - ) - df_t[feature] = df_t[feature].astype("int") - df_u[feature] = df_u[feature].astype("int") - context.log_dataset("t_discrete", df_t, format="parquet") - context.log_dataset("u_discrete", df_u, format="parquet") - - context.logger.info("Compute prior metrics") - - results = {} - t_prior, u_prior = to_observations( - context, - df_t.drop(drop_columns, axis=1), - df_u.drop(drop_columns, axis=1), - "features", - ) - results["prior_tvd"], results["prior_helinger"], results["prior_kld"] = all_metrics( - t_prior, u_prior - ) - - if prediction_col is not None: - context.logger.info("Compute prediction metrics") - t_predictions = pd.DataFrame(df_t.loc[:, prediction_col]) - u_predictions = pd.DataFrame(df_u.loc[:, prediction_col]) - t_class, u_class = to_observations( - context, t_predictions, u_predictions, "prediction" - ) - ( - results["prediction_shift_tvd"], - results["prediction_shift_helinger"], - results["prediction_shift_kld"], - ) = all_metrics(t_class, u_class) - - if label_col is not None: - context.logger.info("Compute class metrics") - t_labels = pd.DataFrame(df_t.loc[:, label_col]) - u_labels = pd.DataFrame(df_u.loc[:, label_col]) - t_class, u_class = to_observations(context, t_labels, u_labels, "class") - ( - results["class_shift_tvd"], - results["class_shift_helinger"], - results["class_shift_kld"], - ) = all_metrics(t_class, u_class) - - for key, value in results.items(): - if value == float("inf"): - 
context.logger.info(f"value: {value}") - results[key] = 10 - for key, result in results.items(): - context.log_result(key, round(result, 3)) - - now = pd.to_datetime(str(datetime.datetime.now())) - now - - results["timestamp"] = pd.to_datetime(str((datetime.datetime.now()))) - context.logger.info(f"Timestamp: {results['timestamp']}") - results["stream"] = stream_name - results_df = pd.DataFrame( - data=[list(results.values())], columns=list(results.keys()) - ) - results_df = results_df.set_index(["timestamp", "stream"]) - v3io_client.write("tsdb", results_tsdb_table, dfs=results_df)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/src/function.yaml b/functions/development/xgb_serving/0.0.1/src/function.yaml deleted file mode 100644 index 2e7bc84c..00000000 --- a/functions/development/xgb_serving/0.0.1/src/function.yaml +++ /dev/null @@ -1,49 +0,0 @@ -kind: remote -metadata: - name: xgb-serving - tag: '' - hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16 - project: default - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - entry_points: - load: - name: load - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 10 - predict: - name: predict - doc: '' - parameters: - - name: self - default: '' - - name: body - default: '' - outputs: - - default: '' - lineno: 14 - description: deploy an XGBoost model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: '' - source: '' - build: - functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py - 
default_handler: handler - affinity: null -verbose: false diff --git a/functions/development/xgb_serving/0.0.1/src/item.yaml b/functions/development/xgb_serving/0.0.1/src/item.yaml deleted file mode 100644 index 662b55b6..00000000 --- a/functions/development/xgb_serving/0.0.1/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. -doc: '' -example: xgb_serving.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.6.2 -name: xgb_serving -platformVersion: 3.0.0 -spec: - filename: xgb_serving.py - handler: handler - image: mlrun/ml-models - kind: remote - requirements: [] -url: '' -version: 0.0.1 diff --git a/functions/development/xgb_serving/0.0.1/src/requirements.txt b/functions/development/xgb_serving/0.0.1/src/requirements.txt deleted file mode 100644 index 2e6aaf5a..00000000 --- a/functions/development/xgb_serving/0.0.1/src/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -mlrun -pandas -xgboost -cloudpickle -pygit2 -sklearn -scikit-plot -seaborn \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/src/test_xgb_serving.py b/functions/development/xgb_serving/0.0.1/src/test_xgb_serving.py deleted file mode 100644 index 4c7c7038..00000000 --- a/functions/development/xgb_serving/0.0.1/src/test_xgb_serving.py +++ /dev/null @@ -1,45 +0,0 @@ -from mlrun import import_function -import os -import pandas as pd -from xgb_serving import XGBoostModel - - -ARTIFACT_PATH = "artifacts" -FUNCTION_PATH = "functions" -MODELS_PATH = "models" -PLOTS_PATH = "plots" -RUNS_PATH = "runs" -SCHEDULES_PATH = "schedules" - - -def test_local_xgb_serving(): - # importing data preparation function (gen_class_data) locally - fn = import_function("../gen_class_data/function.yaml") - fn.run(params={ - "n_samples": 10_000, - "m_features": 5, - "k_classes": 2, - "weight": [0.5, 0.5], - "sk_params": 
{"n_informative": 2}, - "file_ext": "csv"}, local=True, artifact_path="./artifacts/inputs") - - # importing model training function (xgb_trainer) locally - fn = import_function("../xgb_trainer/function.yaml") - fn.run(params={ - "model_type": "classifier", - "CLASS_tree_method": "hist", - "CLASS_objective": "binary:logistic", - "CLASS_booster": "gbtree", - "FIT_verbose": 0, - "label_column": "labels", - "test_set": "./artifacts/test-set"}, - local=True, inputs={"dataset": './artifacts/inputs/classifier-data.csv'}) - - # because this class is implemented with MLModelServer, creating a class instance and not to_mock_server(V2_Model_Server). - model = os.getcwd() + "/models/model.pkl" - my_server = XGBoostModel("my-model", model_dir=model) - my_server.load() - # Testing the model - xtest = pd.read_csv('./artifacts/inputs/classifier-data.csv') - preds = my_server.predict({"instances": xtest.values[:10, :-1].tolist()}) - assert (True if preds == [1, 0, 0, 0, 0, 0, 1, 1, 0, 1] else False) is True diff --git a/functions/development/xgb_serving/0.0.1/src/xgb_serving.ipynb b/functions/development/xgb_serving/0.0.1/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- a/functions/development/xgb_serving/0.0.1/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/0.0.1/src/xgb_serving.py b/functions/development/xgb_serving/0.0.1/src/xgb_serving.py deleted file mode 100644 index 41bf327c..00000000 --- a/functions/development/xgb_serving/0.0.1/src/xgb_serving.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import json -import numpy as np -import xgboost as xgb -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.runtimes.MLModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/static/documentation.html b/functions/development/xgb_serving/0.0.1/static/documentation.html deleted file mode 100644 index 81099540..00000000 --- a/functions/development/xgb_serving/0.0.1/static/documentation.html +++ /dev/null @@ -1,138 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

xgb_serving package

-
-

Submodules

-
-
-

xgb_serving.xgb_serving module

-
-
-class xgb_serving.xgb_serving.XGBoostModel(name: str, model_dir: Optional[str] = None, model=None)[source]
-

Bases: mlrun.serving.v1_serving.MLModelServer

-
-
-load()[source]
-
-
-
-predict(body)[source]
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/static/example.html b/functions/development/xgb_serving/0.0.1/static/example.html deleted file mode 100644 index 83750b5b..00000000 --- a/functions/development/xgb_serving/0.0.1/static/example.html +++ /dev/null @@ -1,419 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Deploy a Serverless XGBoost Model Server

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
- -
-

define a new function and its dependencies

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/static/function.html b/functions/development/xgb_serving/0.0.1/static/function.html deleted file mode 100644 index 88ef40f0..00000000 --- a/functions/development/xgb_serving/0.0.1/static/function.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16
-  project: default
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  entry_points:
-    load:
-      name: load
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-    predict:
-      name: predict
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: body
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 14
-  description: deploy an XGBoost model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec: ''
-  source: ''
-  build:
-    functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/static/item.html b/functions/development/xgb_serving/0.0.1/static/item.html deleted file mode 100644 index bf5c062e..00000000 --- a/functions/development/xgb_serving/0.0.1/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.6.2
-name: xgb_serving
-platformVersion: 3.0.0
-spec:
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: remote
-  requirements: []
-url: ''
-version: 0.0.1
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.0.1/static/source.html b/functions/development/xgb_serving/0.0.1/static/source.html deleted file mode 100644 index ce0f84b6..00000000 --- a/functions/development/xgb_serving/0.0.1/static/source.html +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/src/function.yaml b/functions/development/xgb_serving/0.8.0/src/function.yaml deleted file mode 100644 index 2e7bc84c..00000000 --- a/functions/development/xgb_serving/0.8.0/src/function.yaml +++ /dev/null @@ -1,49 +0,0 @@ -kind: remote -metadata: - name: xgb-serving - tag: '' - hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16 - project: default - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - entry_points: - load: - name: load - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 10 - predict: - name: predict - doc: '' - parameters: - - name: self - default: '' - - name: body - default: '' - outputs: - - default: '' - lineno: 14 - description: deploy an XGBoost model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: '' - source: '' - build: - functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py - default_handler: 
handler - affinity: null -verbose: false diff --git a/functions/development/xgb_serving/0.8.0/src/item.yaml b/functions/development/xgb_serving/0.8.0/src/item.yaml deleted file mode 100644 index cee47d5f..00000000 --- a/functions/development/xgb_serving/0.8.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. -doc: '' -example: xgb_serving.ipynb -generationDate: 2021-05-19:23-13 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: xgb_serving -platformVersion: 3.2.0 -spec: - filename: xgb_serving.py - handler: handler - image: mlrun/ml-models - kind: remote - requirements: [] -url: '' -version: 0.8.0 diff --git a/functions/development/xgb_serving/0.8.0/src/requirements.txt b/functions/development/xgb_serving/0.8.0/src/requirements.txt deleted file mode 100644 index 2e6aaf5a..00000000 --- a/functions/development/xgb_serving/0.8.0/src/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -mlrun -pandas -xgboost -cloudpickle -pygit2 -sklearn -scikit-plot -seaborn \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/src/test_xgb_serving.py b/functions/development/xgb_serving/0.8.0/src/test_xgb_serving.py deleted file mode 100644 index 4c7c7038..00000000 --- a/functions/development/xgb_serving/0.8.0/src/test_xgb_serving.py +++ /dev/null @@ -1,45 +0,0 @@ -from mlrun import import_function -import os -import pandas as pd -from xgb_serving import XGBoostModel - - -ARTIFACT_PATH = "artifacts" -FUNCTION_PATH = "functions" -MODELS_PATH = "models" -PLOTS_PATH = "plots" -RUNS_PATH = "runs" -SCHEDULES_PATH = "schedules" - - -def test_local_xgb_serving(): - # importing data preparation function (gen_class_data) locally - fn = import_function("../gen_class_data/function.yaml") - fn.run(params={ - "n_samples": 10_000, - "m_features": 5, - "k_classes": 2, - "weight": [0.5, 0.5], - "sk_params": 
{"n_informative": 2}, - "file_ext": "csv"}, local=True, artifact_path="./artifacts/inputs") - - # importing model training function (xgb_trainer) locally - fn = import_function("../xgb_trainer/function.yaml") - fn.run(params={ - "model_type": "classifier", - "CLASS_tree_method": "hist", - "CLASS_objective": "binary:logistic", - "CLASS_booster": "gbtree", - "FIT_verbose": 0, - "label_column": "labels", - "test_set": "./artifacts/test-set"}, - local=True, inputs={"dataset": './artifacts/inputs/classifier-data.csv'}) - - # because this class is implemented with MLModelServer, creating a class instance and not to_mock_server(V2_Model_Server). - model = os.getcwd() + "/models/model.pkl" - my_server = XGBoostModel("my-model", model_dir=model) - my_server.load() - # Testing the model - xtest = pd.read_csv('./artifacts/inputs/classifier-data.csv') - preds = my_server.predict({"instances": xtest.values[:10, :-1].tolist()}) - assert (True if preds == [1, 0, 0, 0, 0, 0, 1, 1, 0, 1] else False) is True diff --git a/functions/development/xgb_serving/0.8.0/src/xgb_serving.ipynb b/functions/development/xgb_serving/0.8.0/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- a/functions/development/xgb_serving/0.8.0/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/0.8.0/src/xgb_serving.py b/functions/development/xgb_serving/0.8.0/src/xgb_serving.py deleted file mode 100644 index 41bf327c..00000000 --- a/functions/development/xgb_serving/0.8.0/src/xgb_serving.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import json -import numpy as np -import xgboost as xgb -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.runtimes.MLModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/static/documentation.html b/functions/development/xgb_serving/0.8.0/static/documentation.html deleted file mode 100644 index 81099540..00000000 --- a/functions/development/xgb_serving/0.8.0/static/documentation.html +++ /dev/null @@ -1,138 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

xgb_serving package

-
-

Submodules

-
-
-

xgb_serving.xgb_serving module

-
-
-class xgb_serving.xgb_serving.XGBoostModel(name: str, model_dir: Optional[str] = None, model=None)[source]
-

Bases: mlrun.serving.v1_serving.MLModelServer

-
-
-load()[source]
-
-
-
-predict(body)[source]
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/static/example.html b/functions/development/xgb_serving/0.8.0/static/example.html deleted file mode 100644 index 41836f94..00000000 --- a/functions/development/xgb_serving/0.8.0/static/example.html +++ /dev/null @@ -1,419 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Deploy a Serverless XGBoost Model Server

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
- -
-

define a new function and its dependencies

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/static/function.html b/functions/development/xgb_serving/0.8.0/static/function.html deleted file mode 100644 index 88ef40f0..00000000 --- a/functions/development/xgb_serving/0.8.0/static/function.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16
-  project: default
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  entry_points:
-    load:
-      name: load
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-    predict:
-      name: predict
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: body
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 14
-  description: deploy an XGBoost model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec: ''
-  source: ''
-  build:
-    functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/static/item.html b/functions/development/xgb_serving/0.8.0/static/item.html deleted file mode 100644 index fe69a17b..00000000 --- a/functions/development/xgb_serving/0.8.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2021-05-19:23-13
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: xgb_serving
-platformVersion: 3.2.0
-spec:
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: remote
-  requirements: []
-url: ''
-version: 0.8.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.8.0/static/source.html b/functions/development/xgb_serving/0.8.0/static/source.html deleted file mode 100644 index ce0f84b6..00000000 --- a/functions/development/xgb_serving/0.8.0/static/source.html +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/src/function.yaml b/functions/development/xgb_serving/0.9.0/src/function.yaml deleted file mode 100644 index 5784ed42..00000000 --- a/functions/development/xgb_serving/0.9.0/src/function.yaml +++ /dev/null @@ -1,49 +0,0 @@ -kind: remote -metadata: - name: xgb-serving - tag: '' - hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - entry_points: - load: - name: load - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 10 - predict: - name: predict - doc: '' - parameters: - - name: self - default: '' - - name: body - default: '' - outputs: - - default: '' - lineno: 14 - description: deploy an XGBoost model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: '' - source: '' - build: - functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py - default_handler: 
handler - affinity: null -verbose: false diff --git a/functions/development/xgb_serving/0.9.0/src/item.yaml b/functions/development/xgb_serving/0.9.0/src/item.yaml deleted file mode 100644 index e8be0cc6..00000000 --- a/functions/development/xgb_serving/0.9.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. -doc: '' -example: xgb_serving.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: xgb_serving -platformVersion: 3.2.0 -spec: - filename: xgb_serving.py - handler: handler - image: mlrun/ml-models - kind: remote - requirements: [] -url: '' -version: 0.9.0 diff --git a/functions/development/xgb_serving/0.9.0/src/requirements.txt b/functions/development/xgb_serving/0.9.0/src/requirements.txt deleted file mode 100644 index 2e6aaf5a..00000000 --- a/functions/development/xgb_serving/0.9.0/src/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -mlrun -pandas -xgboost -cloudpickle -pygit2 -sklearn -scikit-plot -seaborn \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/src/test_xgb_serving.py b/functions/development/xgb_serving/0.9.0/src/test_xgb_serving.py deleted file mode 100644 index 4c7c7038..00000000 --- a/functions/development/xgb_serving/0.9.0/src/test_xgb_serving.py +++ /dev/null @@ -1,45 +0,0 @@ -from mlrun import import_function -import os -import pandas as pd -from xgb_serving import XGBoostModel - - -ARTIFACT_PATH = "artifacts" -FUNCTION_PATH = "functions" -MODELS_PATH = "models" -PLOTS_PATH = "plots" -RUNS_PATH = "runs" -SCHEDULES_PATH = "schedules" - - -def test_local_xgb_serving(): - # importing data preparation function (gen_class_data) locally - fn = import_function("../gen_class_data/function.yaml") - fn.run(params={ - "n_samples": 10_000, - "m_features": 5, - "k_classes": 2, - "weight": [0.5, 0.5], - "sk_params": 
{"n_informative": 2}, - "file_ext": "csv"}, local=True, artifact_path="./artifacts/inputs") - - # importing model training function (xgb_trainer) locally - fn = import_function("../xgb_trainer/function.yaml") - fn.run(params={ - "model_type": "classifier", - "CLASS_tree_method": "hist", - "CLASS_objective": "binary:logistic", - "CLASS_booster": "gbtree", - "FIT_verbose": 0, - "label_column": "labels", - "test_set": "./artifacts/test-set"}, - local=True, inputs={"dataset": './artifacts/inputs/classifier-data.csv'}) - - # because this class is implemented with MLModelServer, creating a class instance and not to_mock_server(V2_Model_Server). - model = os.getcwd() + "/models/model.pkl" - my_server = XGBoostModel("my-model", model_dir=model) - my_server.load() - # Testing the model - xtest = pd.read_csv('./artifacts/inputs/classifier-data.csv') - preds = my_server.predict({"instances": xtest.values[:10, :-1].tolist()}) - assert (True if preds == [1, 0, 0, 0, 0, 0, 1, 1, 0, 1] else False) is True diff --git a/functions/development/xgb_serving/0.9.0/src/xgb_serving.ipynb b/functions/development/xgb_serving/0.9.0/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- a/functions/development/xgb_serving/0.9.0/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/0.9.0/src/xgb_serving.py b/functions/development/xgb_serving/0.9.0/src/xgb_serving.py deleted file mode 100644 index 41bf327c..00000000 --- a/functions/development/xgb_serving/0.9.0/src/xgb_serving.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import json -import numpy as np -import xgboost as xgb -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.runtimes.MLModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/static/documentation.html b/functions/development/xgb_serving/0.9.0/static/documentation.html deleted file mode 100644 index f7a0ab6b..00000000 --- a/functions/development/xgb_serving/0.9.0/static/documentation.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

xgb_serving package

-
-

Submodules

-
-
-

xgb_serving.xgb_serving module

-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/static/example.html b/functions/development/xgb_serving/0.9.0/static/example.html deleted file mode 100644 index 41836f94..00000000 --- a/functions/development/xgb_serving/0.9.0/static/example.html +++ /dev/null @@ -1,419 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Deploy a Serverless XGBoost Model Server

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
- -
-

define a new function and its dependencies

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/static/function.html b/functions/development/xgb_serving/0.9.0/static/function.html deleted file mode 100644 index 07bbb576..00000000 --- a/functions/development/xgb_serving/0.9.0/static/function.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  entry_points:
-    load:
-      name: load
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-    predict:
-      name: predict
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: body
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 14
-  description: deploy an XGBoost model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec: ''
-  source: ''
-  build:
-    functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/static/item.html b/functions/development/xgb_serving/0.9.0/static/item.html deleted file mode 100644 index 8ea0a05d..00000000 --- a/functions/development/xgb_serving/0.9.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: xgb_serving
-platformVersion: 3.2.0
-spec:
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: remote
-  requirements: []
-url: ''
-version: 0.9.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/0.9.0/static/source.html b/functions/development/xgb_serving/0.9.0/static/source.html deleted file mode 100644 index ce0f84b6..00000000 --- a/functions/development/xgb_serving/0.9.0/static/source.html +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/src/function.yaml b/functions/development/xgb_serving/1.0.0/src/function.yaml deleted file mode 100644 index 5784ed42..00000000 --- a/functions/development/xgb_serving/1.0.0/src/function.yaml +++ /dev/null @@ -1,49 +0,0 @@ -kind: remote -metadata: - name: xgb-serving - tag: '' - hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - entry_points: - load: - name: load - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 10 - predict: - name: predict - doc: '' - parameters: - - name: self - default: '' - - name: body - default: '' - outputs: - - default: '' - lineno: 14 - description: deploy an XGBoost model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: '' - source: '' - build: - functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py - default_handler: 
handler - affinity: null -verbose: false diff --git a/functions/development/xgb_serving/1.0.0/src/item.yaml b/functions/development/xgb_serving/1.0.0/src/item.yaml deleted file mode 100644 index 205689f6..00000000 --- a/functions/development/xgb_serving/1.0.0/src/item.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. -doc: '' -example: xgb_serving.ipynb -generationDate: 2021-11-18:12-28 -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 0.8.0 -name: xgb_serving -platformVersion: 3.2.0 -spec: - filename: xgb_serving.py - handler: handler - image: mlrun/ml-models - kind: remote - requirements: [] -url: '' -version: 1.0.0 diff --git a/functions/development/xgb_serving/1.0.0/src/requirements.txt b/functions/development/xgb_serving/1.0.0/src/requirements.txt deleted file mode 100644 index 2e6aaf5a..00000000 --- a/functions/development/xgb_serving/1.0.0/src/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -mlrun -pandas -xgboost -cloudpickle -pygit2 -sklearn -scikit-plot -seaborn \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/src/test_xgb_serving.py b/functions/development/xgb_serving/1.0.0/src/test_xgb_serving.py deleted file mode 100644 index 629a46a1..00000000 --- a/functions/development/xgb_serving/1.0.0/src/test_xgb_serving.py +++ /dev/null @@ -1,46 +0,0 @@ -from mlrun import import_function -import os -import pandas as pd -from xgb_serving import XGBoostModel - - -ARTIFACT_PATH = "artifacts" -FUNCTION_PATH = "functions" -MODELS_PATH = "models" -PLOTS_PATH = "plots" -RUNS_PATH = "runs" -SCHEDULES_PATH = "schedules" - - -def test_local_xgb_serving(): - # importing data preparation function (gen_class_data) locally - fn = import_function("hub://gen_class_data") - fn.run(params={ - "n_samples": 10_000, - "m_features": 5, - "k_classes": 2, - "header": None, - "weight": [0.5, 0.5], - "sk_params": 
{"n_informative": 2}, - "file_ext": "csv"}, local=True, artifact_path="./artifacts/inputs") - - # importing model training function (xgb_trainer) locally - fn = import_function("../xgb_trainer/function.yaml") - fn.run(params={ - "model_type": "classifier", - "CLASS_tree_method": "hist", - "CLASS_objective": "binary:logistic", - "CLASS_booster": "gbtree", - "FIT_verbose": 0, - "label_column": "labels", - "test_set": "./artifacts/test-set"}, - local=True, inputs={"dataset": './artifacts/inputs/classifier-data.csv'}) - - # because this class is implemented with MLModelServer, creating a class instance and not to_mock_server(V2_Model_Server). - model = os.getcwd() + "/models/model.pkl" - my_server = XGBoostModel("my-model", model_dir=model) - my_server.load() - # Testing the model - xtest = pd.read_csv('./artifacts/inputs/classifier-data.csv') - preds = my_server.predict({"instances": xtest.values[:10, :-1].tolist()}) - assert (True if preds == [1, 0, 0, 0, 0, 0, 1, 1, 0, 1] else False) is True diff --git a/functions/development/xgb_serving/1.0.0/src/xgb_serving.ipynb b/functions/development/xgb_serving/1.0.0/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- a/functions/development/xgb_serving/1.0.0/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/1.0.0/src/xgb_serving.py b/functions/development/xgb_serving/1.0.0/src/xgb_serving.py deleted file mode 100644 index 41bf327c..00000000 --- a/functions/development/xgb_serving/1.0.0/src/xgb_serving.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import json -import numpy as np -import xgboost as xgb -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.runtimes.MLModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/static/documentation.html b/functions/development/xgb_serving/1.0.0/static/documentation.html deleted file mode 100644 index 5b6ba17c..00000000 --- a/functions/development/xgb_serving/1.0.0/static/documentation.html +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
- -
- - - - - - -
- - -
-
-
-
-
-
-

xgb_serving package

-
-

Submodules

-
-
-

xgb_serving.xgb_serving module

-
-
-class xgb_serving.xgb_serving.XGBoostModel(name: str, model_dir: Optional[str] = None, model=None)[source]
-

Bases: mlrun.serving.v1_serving.MLModelServer

-
-
-load()[source]
-
-
-
-predict(body)[source]
-
-
-
-
-

Module contents

-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/static/example.html b/functions/development/xgb_serving/1.0.0/static/example.html deleted file mode 100644 index 768a5629..00000000 --- a/functions/development/xgb_serving/1.0.0/static/example.html +++ /dev/null @@ -1,422 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-
-

Deploy a Serverless XGBoost Model Server

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
-
-

steps

-

define a new function and its dependencies
-test the model serving class locally
-deploy our serving class using as a serverless function
-test our model server using HTTP request

-
-
-
# nuclio: ignore
-import nuclio 
-
-
-
-
-

-
-
-

define a new function and its dependencies

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
-
-
-
-
-

- - © Copyright .
-

-
-
-
-
-
- - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/static/function.html b/functions/development/xgb_serving/1.0.0/static/function.html deleted file mode 100644 index 07bbb576..00000000 --- a/functions/development/xgb_serving/1.0.0/static/function.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  entry_points:
-    load:
-      name: load
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-    predict:
-      name: predict
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: body
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 14
-  description: deploy an XGBoost model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec: ''
-  source: ''
-  build:
-    functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/static/item.html b/functions/development/xgb_serving/1.0.0/static/item.html deleted file mode 100644 index fb0a1f2c..00000000 --- a/functions/development/xgb_serving/1.0.0/static/item.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2021-11-18:12-28
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 0.8.0
-name: xgb_serving
-platformVersion: 3.2.0
-spec:
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: remote
-  requirements: []
-url: ''
-version: 1.0.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.0.0/static/source.html b/functions/development/xgb_serving/1.0.0/static/source.html deleted file mode 100644 index ce0f84b6..00000000 --- a/functions/development/xgb_serving/1.0.0/static/source.html +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/src/function.yaml b/functions/development/xgb_serving/1.1.0/src/function.yaml deleted file mode 100644 index 5784ed42..00000000 --- a/functions/development/xgb_serving/1.1.0/src/function.yaml +++ /dev/null @@ -1,49 +0,0 @@ -kind: remote -metadata: - name: xgb-serving - tag: '' - hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/ml-models - entry_points: - load: - name: load - doc: '' - parameters: - - name: self - default: '' - outputs: - - default: '' - lineno: 10 - predict: - name: predict - doc: '' - parameters: - - name: self - default: '' - - name: body - default: '' - outputs: - - default: '' - lineno: 14 - description: deploy an XGBoost model server. - min_replicas: 1 - max_replicas: 4 - env: [] - base_spec: '' - source: '' - build: - functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py - default_handler: 
handler - affinity: null -verbose: false diff --git a/functions/development/xgb_serving/1.1.0/src/item.yaml b/functions/development/xgb_serving/1.1.0/src/item.yaml deleted file mode 100644 index 9a78110b..00000000 --- a/functions/development/xgb_serving/1.1.0/src/item.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. -doc: '' -example: xgb_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.1.0 -name: xgb_serving -platformVersion: 3.5.0 -spec: - filename: xgb_serving.py - handler: handler - image: mlrun/ml-models - kind: remote - requirements: [] -url: '' -version: 1.1.0 diff --git a/functions/development/xgb_serving/1.1.0/src/requirements.txt b/functions/development/xgb_serving/1.1.0/src/requirements.txt deleted file mode 100644 index 17b25ec3..00000000 --- a/functions/development/xgb_serving/1.1.0/src/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -mlrun -pandas -xgboost -cloudpickle -pygit2 -scikit-learn -scikit-plot -seaborn \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/src/test_xgb_serving.py b/functions/development/xgb_serving/1.1.0/src/test_xgb_serving.py deleted file mode 100644 index ce5e8aaa..00000000 --- a/functions/development/xgb_serving/1.1.0/src/test_xgb_serving.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -from mlrun import import_function -import os -import pandas as pd -from xgb_serving import XGBoostModel - -def test_local_xgb_serving(): - # importing data preparation function (gen_class_data) locally - fn = import_function("hub://gen_class_data") - gen_data_run = fn.run(params={"n_samples": 10_000, - "m_features": 5, - "k_classes": 2, - "header": None, - "weight": [0.5, 0.5], - "sk_params": {"n_informative": 2}, - "file_ext": "csv"}, - local=True, - artifact_path="./") - - # importing model training function (xgb_trainer) locally - fn = import_function("../xgb_trainer/function.yaml") - xgb_trainer_run = fn.run(params={"model_type": "classifier", - "CLASS_tree_method": "hist", - "CLASS_objective": "binary:logistic", - "CLASS_booster": "gbtree", - "FIT_verbose": 0, - "label_column": "labels"}, - local=True, - inputs={"dataset": gen_data_run.artifact('classifier-data').url}, - artifact_path='./') - - # because this class is implemented with MLModelServer, creating a class instance and not to_mock_server(V2_Model_Server). 
- model = xgb_trainer_run.artifact('model').url - my_server = XGBoostModel("my-model", model_dir=model) - my_server.load() - # Testing the model - xtest = pd.read_csv(gen_data_run.artifact('classifier-data').url) - preds = my_server.predict({"instances": xtest.values[:10, :-1].tolist()}) - assert (True if preds == [1, 0, 0, 0, 0, 0, 1, 1, 0, 1] else False) is True \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/src/xgb_serving.ipynb b/functions/development/xgb_serving/1.1.0/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- a/functions/development/xgb_serving/1.1.0/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/1.1.0/src/xgb_serving.py b/functions/development/xgb_serving/1.1.0/src/xgb_serving.py deleted file mode 100644 index 94ecb622..00000000 --- a/functions/development/xgb_serving/1.1.0/src/xgb_serving.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os -import json -import numpy as np -import xgboost as xgb -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.runtimes.MLModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/static/documentation.html b/functions/development/xgb_serving/1.1.0/static/documentation.html deleted file mode 100644 index bf910cd7..00000000 --- a/functions/development/xgb_serving/1.1.0/static/documentation.html +++ /dev/null @@ -1,235 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

xgb_serving package

- -
- -
-
-
-
-
-

xgb_serving package#

-
-

Submodules#

-
-
-

xgb_serving.xgb_serving module#

-
-
-class xgb_serving.xgb_serving.XGBoostModel(name: str, model_dir: Optional[str] = None, model=None)[source]#
-

Bases: mlrun.serving.v1_serving.MLModelServer

-
-
-load()[source]#
-
-
-
-predict(body)[source]#
-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/static/example.html b/functions/development/xgb_serving/1.1.0/static/example.html deleted file mode 100644 index 5f2174ae..00000000 --- a/functions/development/xgb_serving/1.1.0/static/example.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Deploy a Serverless XGBoost Model Server#

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s#

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
-
-

steps#

-

define a new function and its dependencies
-test the model serving class locally
-deploy our serving class using as a serverless function
-test our model server using HTTP request

-
-
-
# nuclio: ignore
-import nuclio 
-
-
-
-
-

-
-
-

define a new function and its dependencies#

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code#

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally#

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function#

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request#

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/static/function.html b/functions/development/xgb_serving/1.1.0/static/function.html deleted file mode 100644 index 07bbb576..00000000 --- a/functions/development/xgb_serving/1.1.0/static/function.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: remote
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: d80a0b814b7707e100b3cd7d69fee276ebb2ff16
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/ml-models
-  entry_points:
-    load:
-      name: load
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 10
-    predict:
-      name: predict
-      doc: ''
-      parameters:
-      - name: self
-        default: ''
-      - name: body
-        default: ''
-      outputs:
-      - default: ''
-      lineno: 14
-  description: deploy an XGBoost model server.
-  min_replicas: 1
-  max_replicas: 4
-  env: []
-  base_spec: ''
-  source: ''
-  build:
-    functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgeGdib29zdCBhcyB4Z2IKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAppbXBvcnQgbWxydW4KCgpjbGFzcyBYR0Jvb3N0TW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoIi5wa2wiKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4oc3RyKG1vZGVsX2ZpbGUpLCAicmIiKSkKCiAgICBkZWYgcHJlZGljdChzZWxmLCBib2R5KToKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WyJpbnN0YW5jZXMiXSwgZHR5cGU9bnAuZmxvYXQzMikucmVzaGFwZSgtMSwgNSkKICAgICAgICAgICAgcmVzdWx0ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzLCB2YWxpZGF0ZV9mZWF0dXJlcz1GYWxzZSkKICAgICAgICAgICAgcmV0dXJuIHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJGYWlsZWQgdG8gcHJlZGljdCAlcyIgJSBlKQ==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#55a79c32be5d233cc11efcf40cd3edbe309bfdef:/home/kali/functions/xgb_serving/xgb_serving.py
-  default_handler: handler
-  affinity: null
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/static/item.html b/functions/development/xgb_serving/1.1.0/static/item.html deleted file mode 100644 index a5f1378c..00000000 --- a/functions/development/xgb_serving/1.1.0/static/item.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.1.0
-name: xgb_serving
-platformVersion: 3.5.0
-spec:
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/ml-models
-  kind: remote
-  requirements: []
-url: ''
-version: 1.1.0
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/static/source.html b/functions/development/xgb_serving/1.1.0/static/source.html deleted file mode 100644 index d052d23b..00000000 --- a/functions/development/xgb_serving/1.1.0/static/source.html +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.0/static/xgb_serving.html b/functions/development/xgb_serving/1.1.0/static/xgb_serving.html deleted file mode 100644 index 160d7274..00000000 --- a/functions/development/xgb_serving/1.1.0/static/xgb_serving.html +++ /dev/null @@ -1,174 +0,0 @@ - - - - - - - -xgb_serving.xgb_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for xgb_serving.xgb_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-import mlrun
-
-
-
[docs]class XGBoostModel(mlrun.runtimes.MLModelServer): -
[docs] def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb"))
- -
[docs] def predict(self, body): - try: - feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/src/function.yaml b/functions/development/xgb_serving/1.1.2/src/function.yaml deleted file mode 100644 index 7073d8ba..00000000 --- a/functions/development/xgb_serving/1.1.2/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: serving -metadata: - name: xgb-serving - tag: '' - hash: 200148a9a4815d8b0394038d973b59eda1776d36 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucApmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmltcG9ydCBtbHJ1bgoKCmNsYXNzIFhHQm9vc3RNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIucGtsIikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZChvcGVuKHN0cihtb2RlbF9maWxlKSwgInJiIikpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsiaW5wdXRzIl0sIGR0eXBlPW5wLmZsb2F0MzIpLnJlc2hhcGUoLTEsIDUpCiAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cywgdmFsaWRhdGVfZmVhdHVyZXM9RmFsc2UpCiAgICAgICAgICAgIHJldHVybiByZ
XN1bHQudG9saXN0KCkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#2675b0d235d93571a696296c93cfb2103cbf261f:/Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py - origin_filename: /Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py - requirements: [] - description: deploy an XGBoost model server. - default_handler: '' - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - min_replicas: 1 - max_replicas: 4 - source: '' - function_kind: serving_v2 - function_handler: xgb_serving:handler - base_image_pull: false - default_class: ClassifierModel - secret_sources: [] - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/xgb_serving/1.1.2/src/item.yaml b/functions/development/xgb_serving/1.1.2/src/item.yaml deleted file mode 100644 index 413e26bf..00000000 --- a/functions/development/xgb_serving/1.1.2/src/item.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. 
-doc: '' -example: xgb_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.1 -name: xgb_serving -platformVersion: 3.5.3 -spec: - customFields: - default_class: ClassifierModel - filename: xgb_serving.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: [] -url: '' -version: 1.1.2 - - diff --git a/functions/development/xgb_serving/1.1.2/src/requirements.txt b/functions/development/xgb_serving/1.1.2/src/requirements.txt deleted file mode 100644 index a5bbcdde..00000000 --- a/functions/development/xgb_serving/1.1.2/src/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -pandas -xgboost -cloudpickle -pygit2 -scikit-learn==1.0.2 -scikit-plot -seaborn diff --git a/functions/development/xgb_serving/1.1.2/src/test_xgb_serving.py b/functions/development/xgb_serving/1.1.2/src/test_xgb_serving.py deleted file mode 100644 index 52f6ccb6..00000000 --- a/functions/development/xgb_serving/1.1.2/src/test_xgb_serving.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import mlrun -import os -import pandas as pd -from xgb_serving import XGBoostModel - - -def get_class_data(): - fn = mlrun.import_function('../gen_class_data/function.yaml') - run = fn.run(params={'key': 'classifier-data', - 'n_samples': 10_000, - 'm_features': 5, - 'k_classes': 2, - 'header': None, - 'weight': [0.5, 0.5], - 'sk_params': {'n_informative': 2}, - 'file_ext': 'csv'}, local=True, artifact_path="./artifacts") - return run - - -def xgb_trainer(): - # running data preparation function locally - gen_data_run = get_class_data() - - fn = mlrun.import_function('../xgb_trainer/function.yaml') - run = fn.run(params={'model_type': 'classifier', - 'CLASS_tree_method': 'hist', - 'CLASS_objective': 'binary:logistic', - 'CLASS_booster': 'gbtree', - 'FIT_verbose': 0, - 'label_column': 'labels'}, - local=True, inputs={'dataset': gen_data_run.status.artifacts[0]['spec']['target_path']}) - - for artifact in run.status.artifacts: - if artifact['kind'] == 'model': - assert os.path.exists(artifact['spec']['target_path']), "Failed locating model file" # validating model exists - return artifact['spec']['target_path'] + artifact['spec']['model_file'], gen_data_run.status.artifacts[0]['spec']['target_path'] - assert False, "Failed creating model" - - -def test_local_xgb_serving(): - model_path, dataset_path = xgb_trainer() - fn = mlrun.import_function('function.yaml') - - fn.add_model(key='my_model', model_path=model_path, class_name='XGBoostModel') - server = fn.to_mock_server() - - # Testing the model - df = pd.read_csv(dataset_path) - x = df.drop(['labels'], axis=1).iloc[0].tolist() - y_true = df['labels'][0] - - y_pred = server.test(path='/v2/models/my_model/predict', body={"inputs": x})['outputs'][0] - assert y_true == y_pred diff --git a/functions/development/xgb_serving/1.1.2/src/xgb_serving.ipynb b/functions/development/xgb_serving/1.1.2/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- 
a/functions/development/xgb_serving/1.1.2/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/1.1.2/src/xgb_serving.py b/functions/development/xgb_serving/1.1.2/src/xgb_serving.py deleted file mode 100644 index a4d095e5..00000000 --- a/functions/development/xgb_serving/1.1.2/src/xgb_serving.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os -import json -import numpy as np -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.serving.V2ModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["inputs"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/static/documentation.html b/functions/development/xgb_serving/1.1.2/static/documentation.html deleted file mode 100644 index 54daeb03..00000000 --- a/functions/development/xgb_serving/1.1.2/static/documentation.html +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

xgb_serving package

- -
- -
-
-
-
-
-

xgb_serving package#

-
-

Submodules#

-
-
-

xgb_serving.xgb_serving module#

-
-
-class xgb_serving.xgb_serving.XGBoostModel(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]#
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]#
-

model loading function, see also .get_model() method

-
-
-
-predict(body)[source]#
-

model prediction operation

-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/static/example.html b/functions/development/xgb_serving/1.1.2/static/example.html deleted file mode 100644 index 5f2174ae..00000000 --- a/functions/development/xgb_serving/1.1.2/static/example.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Deploy a Serverless XGBoost Model Server#

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s#

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
-
-

steps#

-

define a new function and its dependencies
-test the model serving class locally
-deploy our serving class using as a serverless function
-test our model server using HTTP request

-
-
-
# nuclio: ignore
-import nuclio 
-
-
-
-
-

-
-
-

define a new function and its dependencies#

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code#

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally#

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function#

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request#

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/static/function.html b/functions/development/xgb_serving/1.1.2/static/function.html deleted file mode 100644 index 2c8642eb..00000000 --- a/functions/development/xgb_serving/1.1.2/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: 200148a9a4815d8b0394038d973b59eda1776d36
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucApmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmltcG9ydCBtbHJ1bgoKCmNsYXNzIFhHQm9vc3RNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIucGtsIikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZChvcGVuKHN0cihtb2RlbF9maWxlKSwgInJiIikpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsiaW5wdXRzIl0sIGR0eXBlPW5wLmZsb2F0MzIpLnJlc2hhcGUoLTEsIDUpCiAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cywgdmFsaWRhdGVfZmVhdHVyZXM9RmFsc2UpCiAgICAgICAgICAgIHJldHVybiByZXN1bHQudG9saXN0KCkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#2675b0d235d93571a696296c93cfb2103cbf261f:/Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py
-    origin_filename: /Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py
-    requirements: []
-  description: deploy an XGBoost model server.
-  default_handler: ''
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  min_replicas: 1
-  max_replicas: 4
-  source: ''
-  function_kind: serving_v2
-  function_handler: xgb_serving:handler
-  base_image_pull: false
-  default_class: ClassifierModel
-  secret_sources: []
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/static/item.html b/functions/development/xgb_serving/1.1.2/static/item.html deleted file mode 100644 index 7cb5bff1..00000000 --- a/functions/development/xgb_serving/1.1.2/static/item.html +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.1
-name: xgb_serving
-platformVersion: 3.5.3
-spec:
-  customFields:
-    default_class: ClassifierModel
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements: []
-url: ''
-version: 1.1.2
-
-
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/static/source.html b/functions/development/xgb_serving/1.1.2/static/source.html deleted file mode 100644 index 4789f702..00000000 --- a/functions/development/xgb_serving/1.1.2/static/source.html +++ /dev/null @@ -1,54 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import json
-import numpy as np
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["inputs"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/1.1.2/static/xgb_serving.html b/functions/development/xgb_serving/1.1.2/static/xgb_serving.html deleted file mode 100644 index e8f69eaf..00000000 --- a/functions/development/xgb_serving/1.1.2/static/xgb_serving.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - -xgb_serving.xgb_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for xgb_serving.xgb_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import json
-import numpy as np
-from cloudpickle import load
-import mlrun
-
-
-
[docs]class XGBoostModel(mlrun.serving.V2ModelServer): -
[docs] def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb"))
- -
[docs] def predict(self, body): - try: - feats = np.asarray(body["inputs"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/src/function.yaml b/functions/development/xgb_serving/latest/src/function.yaml deleted file mode 100644 index 7073d8ba..00000000 --- a/functions/development/xgb_serving/latest/src/function.yaml +++ /dev/null @@ -1,40 +0,0 @@ -kind: serving -metadata: - name: xgb-serving - tag: '' - hash: 200148a9a4815d8b0394038d973b59eda1776d36 - project: '' - labels: - author: Daniel - categories: - - model-serving - - machine-learning -spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucApmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmltcG9ydCBtbHJ1bgoKCmNsYXNzIFhHQm9vc3RNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIucGtsIikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZChvcGVuKHN0cihtb2RlbF9maWxlKSwgInJiIikpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsiaW5wdXRzIl0sIGR0eXBlPW5wLmZsb2F0MzIpLnJlc2hhcGUoLTEsIDUpCiAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cywgdmFsaWRhdGVfZmVhdHVyZXM9RmFsc2UpCiAgICAgICAgICAgIHJldHVybi
ByZXN1bHQudG9saXN0KCkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== - commands: [] - code_origin: https://github.com/daniels290813/functions.git#2675b0d235d93571a696296c93cfb2103cbf261f:/Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py - origin_filename: /Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py - requirements: [] - description: deploy an XGBoost model server. - default_handler: '' - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - min_replicas: 1 - max_replicas: 4 - source: '' - function_kind: serving_v2 - function_handler: xgb_serving:handler - base_image_pull: false - default_class: ClassifierModel - secret_sources: [] - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/development/xgb_serving/latest/src/item.yaml b/functions/development/xgb_serving/latest/src/item.yaml deleted file mode 100644 index 413e26bf..00000000 --- a/functions/development/xgb_serving/latest/src/item.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -categories: -- model-serving -- machine-learning -description: deploy an XGBoost model server. 
-doc: '' -example: xgb_serving.ipynb -generationDate: 2022-08-28:17-25 -hidden: false -icon: '' -labels: - author: Daniel -maintainers: [] -marketplaceType: '' -mlrunVersion: 1.4.1 -name: xgb_serving -platformVersion: 3.5.3 -spec: - customFields: - default_class: ClassifierModel - filename: xgb_serving.py - handler: handler - image: mlrun/mlrun - kind: serving - requirements: [] -url: '' -version: 1.1.2 - - diff --git a/functions/development/xgb_serving/latest/src/requirements.txt b/functions/development/xgb_serving/latest/src/requirements.txt deleted file mode 100644 index a5bbcdde..00000000 --- a/functions/development/xgb_serving/latest/src/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -pandas -xgboost -cloudpickle -pygit2 -scikit-learn==1.0.2 -scikit-plot -seaborn diff --git a/functions/development/xgb_serving/latest/src/test_xgb_serving.py b/functions/development/xgb_serving/latest/src/test_xgb_serving.py deleted file mode 100644 index 52f6ccb6..00000000 --- a/functions/development/xgb_serving/latest/src/test_xgb_serving.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import mlrun -import os -import pandas as pd -from xgb_serving import XGBoostModel - - -def get_class_data(): - fn = mlrun.import_function('../gen_class_data/function.yaml') - run = fn.run(params={'key': 'classifier-data', - 'n_samples': 10_000, - 'm_features': 5, - 'k_classes': 2, - 'header': None, - 'weight': [0.5, 0.5], - 'sk_params': {'n_informative': 2}, - 'file_ext': 'csv'}, local=True, artifact_path="./artifacts") - return run - - -def xgb_trainer(): - # running data preparation function locally - gen_data_run = get_class_data() - - fn = mlrun.import_function('../xgb_trainer/function.yaml') - run = fn.run(params={'model_type': 'classifier', - 'CLASS_tree_method': 'hist', - 'CLASS_objective': 'binary:logistic', - 'CLASS_booster': 'gbtree', - 'FIT_verbose': 0, - 'label_column': 'labels'}, - local=True, inputs={'dataset': gen_data_run.status.artifacts[0]['spec']['target_path']}) - - for artifact in run.status.artifacts: - if artifact['kind'] == 'model': - assert os.path.exists(artifact['spec']['target_path']), "Failed locating model file" # validating model exists - return artifact['spec']['target_path'] + artifact['spec']['model_file'], gen_data_run.status.artifacts[0]['spec']['target_path'] - assert False, "Failed creating model" - - -def test_local_xgb_serving(): - model_path, dataset_path = xgb_trainer() - fn = mlrun.import_function('function.yaml') - - fn.add_model(key='my_model', model_path=model_path, class_name='XGBoostModel') - server = fn.to_mock_server() - - # Testing the model - df = pd.read_csv(dataset_path) - x = df.drop(['labels'], axis=1).iloc[0].tolist() - y_true = df['labels'][0] - - y_pred = server.test(path='/v2/models/my_model/predict', body={"inputs": x})['outputs'][0] - assert y_true == y_pred diff --git a/functions/development/xgb_serving/latest/src/xgb_serving.ipynb b/functions/development/xgb_serving/latest/src/xgb_serving.ipynb deleted file mode 100644 index 6c605367..00000000 --- 
a/functions/development/xgb_serving/latest/src/xgb_serving.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deploy a Serverless XGBoost Model Server\n", - " --------------------------------------------------------------------\n", - "\n", - "The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)\n", - "\n", - "#### **notebook how-to's**\n", - "* Write and test model serving class in a notebook.\n", - "* Deploy the model server function.\n", - "* Invoke and test the serving function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### **steps**\n", - "**[define a new function and its dependencies](#define-function)**
\n", - "**[test the model serving class locally](#test-locally)**
\n", - "**[deploy our serving class using as a serverless function](#deploy)**
\n", - "**[test our model server using HTTP request](#test-model-server)**
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: ignore\n", - "import nuclio " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **define a new function and its dependencies**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", - "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", - "%nuclio env MODEL_CLASS=XGBoostModel\n", - "\n", - "%nuclio config spec.build.baseImage = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Function Code" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# import kfserving\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import xgboost as xgb\n", - "from cloudpickle import load\n", - "\n", - "### Model Serving Class\n", - "\n", - "import mlrun\n", - "class XGBoostModel(mlrun.runtimes.MLModelServer):\n", - " def load(self):\n", - " model_file, extra_data = self.get_model(\".pkl\")\n", - " self.model = load(open(str(model_file), \"rb\"))\n", - " \n", - "\n", - " def predict(self, body):\n", - " try:\n", - " feats = np.asarray(body[\"instances\"], dtype=np.float32).reshape(-1, 5)\n", - " result = self.model.predict(feats, validate_features=False)\n", - " return result.tolist()\n", - " except Exception as e:\n", - " raise Exception(\"Failed to predict %s\" % e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. 
_**Please do not remove this cell**_:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# nuclio: end-code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Test the function locally\n", - "\n", - "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n", - "\n", - "> **Verify there is a model file in the model_dir path (generated by the training notebook)**" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import mlconf\n", - "\n", - "model_dir = os.path.join(mlconf.artifact_path, \"xgb/models\")\n", - "\n", - "my_server = XGBoostModel(\"my-model\", model_dir=model_dir)\n", - "my_server.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = mlconf.artifact_path + \"/xgb/classifier-data.csv\"\n", - "MODEL_PATH = mlconf.artifact_path + \"/xgb/models/xgb_test\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "xtest = pd.read_csv(DATA_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the `.predict(body)` method to test the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import json, numpy as np\n", - "preds = my_server.predict({\"instances\":xtest.values[:10,:-1].tolist()})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]\n" - ] - } - ], - "source": [ - "print(\"predicted class:\", preds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **deploy our serving class using as a serverless function**\n", - "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", - "\n", - "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", - "\n", - "**verify the model dir does contain a valid `model.bst` file**" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "from mlrun import new_model_server, mount_v3io\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fn = new_model_server(\"xgb-serving\",\n", - " model_class=\"XGBoostModel\",\n", - " models={\"xgb_serving_v2\": f\"{model_dir}\"})\n", - "fn.spec.description = \"xgboost test data classification server\"\n", - "fn.metadata.categories = [\"serving\", \"ml\"]\n", - "fn.metadata.labels = {\"author\": \"yaronh\", \"framework\": \"xgboost\"}\n", - "\n", - "fn.export(\"function.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## tests" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from mlrun.platforms.other import auto_mount\n", - "fn.apply(auto_mount())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[mlrun] 2020-06-14 12:49:18,128 deploy started\n", - "[nuclio] 2020-06-14 12:49:19,213 (info) Build complete\n", - "[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete\n", - "[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104\n" - ] - } - ], 
- "source": [ - "addr = fn.deploy()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'http://3.23.82.202:30104'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "addr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### **test our model server using HTTP request**\n", - "\n", - "\n", - "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# KFServing protocol event\n", - "event_data = {\"instances\": xtest.values[:10,:-1].tolist()}" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "resp = requests.put(addr + \"/xgb_serving_v2/predict\", json=json.dumps(event_data))\n", - "resp.text" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**[back to top](#top)**" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} 
diff --git a/functions/development/xgb_serving/latest/src/xgb_serving.py b/functions/development/xgb_serving/latest/src/xgb_serving.py deleted file mode 100644 index a4d095e5..00000000 --- a/functions/development/xgb_serving/latest/src/xgb_serving.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os -import json -import numpy as np -from cloudpickle import load -import mlrun - - -class XGBoostModel(mlrun.serving.V2ModelServer): - def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb")) - - def predict(self, body): - try: - feats = np.asarray(body["inputs"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e) \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/static/documentation.html b/functions/development/xgb_serving/latest/static/documentation.html deleted file mode 100644 index 54daeb03..00000000 --- a/functions/development/xgb_serving/latest/static/documentation.html +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - -xgb_serving package - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- -
-

xgb_serving package

- -
- -
-
-
-
-
-

xgb_serving package#

-
-

Submodules#

-
-
-

xgb_serving.xgb_serving module#

-
-
-class xgb_serving.xgb_serving.XGBoostModel(context=None, name: Optional[str] = None, model_path: Optional[str] = None, model=None, protocol=None, input_path: Optional[str] = None, result_path: Optional[str] = None, **kwargs)[source]#
-

Bases: mlrun.serving.v2_serving.V2ModelServer

-
-
-load()[source]#
-

model loading function, see also .get_model() method

-
-
-
-predict(body)[source]#
-

model prediction operation

-
-
-
-
-

Module contents#

-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/static/example.html b/functions/development/xgb_serving/latest/static/example.html deleted file mode 100644 index 5f2174ae..00000000 --- a/functions/development/xgb_serving/latest/static/example.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - -Deploy a Serverless XGBoost Model Server - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - - - -
-
- - -
-
-
- - -
-
-
-

Deploy a Serverless XGBoost Model Server#

-
-

The following notebook demonstrates how to deploy an XGBoost model server (a.k.a Nuclio-serving)

-
-

notebook how-to’s#

-
    -
  • Write and test model serving class in a notebook.

  • -
  • Deploy the model server function.

  • -
  • Invoke and test the serving function.

  • -
-

-
-
-

steps#

-

define a new function and its dependencies
-test the model serving class locally
-deploy our serving class using as a serverless function
-test our model server using HTTP request

-
-
-
# nuclio: ignore
-import nuclio 
-
-
-
-
-

-
-
-

define a new function and its dependencies#

-
-
-
%nuclio config kind="nuclio:serving"
-%nuclio env MODEL_CLASS=XGBoostModel
-
-%nuclio config spec.build.baseImage = "mlrun/ml-models"
-
-
-
-
-
%nuclio: setting kind to 'nuclio:serving'
-%nuclio: setting 'MODEL_CLASS' environment variable
-%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
-
-
-
-
-
-
-

Function Code#

-
-
-
# import kfserving
-import os
-import json
-import numpy as np
-import xgboost as xgb
-from cloudpickle import load
-
-### Model Serving Class
-
-import mlrun
-class XGBoostModel(mlrun.runtimes.MLModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-  
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["instances"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-
-
-
-
-

The following end-code annotation tells nuclio to stop parsing the notebook from this cell. Please do not remove this cell:

-
-
-
# nuclio: end-code
-
-
-
-
-

-
-
-

Test the function locally#

-

The class above can be tested locally. Just instantiate the class, .load() will load the model to a local dir.

-
-

Verify there is a model file in the model_dir path (generated by the training notebook)

-
-
-
-
from mlrun import mlconf
-
-model_dir = os.path.join(mlconf.artifact_path, "xgb/models")
-
-my_server = XGBoostModel("my-model", model_dir=model_dir)
-my_server.load()
-
-
-
-
-
-
-
DATA_PATH = mlconf.artifact_path + "/xgb/classifier-data.csv"
-MODEL_PATH = mlconf.artifact_path + "/xgb/models/xgb_test"
-
-
-
-
-
-
-
import pandas as pd
-xtest = pd.read_csv(DATA_PATH)
-
-
-
-
-

We can use the .predict(body) method to test the model.

-
-
-
import json, numpy as np
-preds = my_server.predict({"instances":xtest.values[:10,:-1].tolist()})
-
-
-
-
-
-
-
print("predicted class:", preds)
-
-
-
-
-
predicted class: [1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

-
-

deploy our serving class using as a serverless function#

-

in the following section we create a new model serving function which wraps our class , and specify model and other resources.

-

the models dict store model names and the assosiated model dir URL (the URL can start with S3:// and other blob store options), the faster way is to use a shared file volume, we use .apply(mount_v3io()) to attach a v3io (iguazio data fabric) volume to our function. By default v3io will mount the current user home into the \User function path.

-

verify the model dir does contain a valid model.bst file

-
-
-
from mlrun import new_model_server, mount_v3io
-import requests
-
-
-
-
-
-
-
fn = new_model_server("xgb-serving",
-                      model_class="XGBoostModel",
-                      models={"xgb_serving_v2": f"{model_dir}"})
-fn.spec.description = "xgboost test data classification server"
-fn.metadata.categories = ["serving", "ml"]
-fn.metadata.labels = {"author": "yaronh", "framework": "xgboost"}
-
-fn.export("function.yaml")
-
-
-
-
-
[mlrun] 2020-06-14 12:49:05,013 function spec saved to path: function.yaml
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
-

tests#

-
-
-
from mlrun.platforms.other import auto_mount
-fn.apply(auto_mount())
-
-
-
-
-
<mlrun.runtimes.function.RemoteRuntime at 0x7f0218662f60>
-
-
-
-
-
-
-
addr = fn.deploy()
-
-
-
-
-
[mlrun] 2020-06-14 12:49:18,128 deploy started
-[nuclio] 2020-06-14 12:49:19,213 (info) Build complete
-[nuclio] 2020-06-14 12:49:27,347 (info) Function deploy complete
-[nuclio] 2020-06-14 12:49:27,354 done updating default-xgb-test, function address: 3.23.82.202:30104
-
-
-
-
-
-
-
addr
-
-
-
-
-
'http://3.23.82.202:30104'
-
-
-
-
-

-
-

test our model server using HTTP request#

-

We invoke our model serving function using test data, the data vector is specified in the instances attribute.

-
-
-
# KFServing protocol event
-event_data = {"instances": xtest.values[:10,:-1].tolist()}
-
-
-
-
-
-
-
import json
-resp = requests.put(addr + "/xgb_serving_v2/predict", json=json.dumps(event_data))
-resp.text
-
-
-
-
-
'[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]'
-
-
-
-
-
-
-
preds
-
-
-
-
-
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
-
-
-
-
-

back to top

-
-
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/static/function.html b/functions/development/xgb_serving/latest/static/function.html deleted file mode 100644 index 2c8642eb..00000000 --- a/functions/development/xgb_serving/latest/static/function.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-kind: serving
-metadata:
-  name: xgb-serving
-  tag: ''
-  hash: 200148a9a4815d8b0394038d973b59eda1776d36
-  project: ''
-  labels:
-    author: Daniel
-  categories:
-  - model-serving
-  - machine-learning
-spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucApmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmltcG9ydCBtbHJ1bgoKCmNsYXNzIFhHQm9vc3RNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCIucGtsIikKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZChvcGVuKHN0cihtb2RlbF9maWxlKSwgInJiIikpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keSk6CiAgICAgICAgdHJ5OgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsiaW5wdXRzIl0sIGR0eXBlPW5wLmZsb2F0MzIpLnJlc2hhcGUoLTEsIDUpCiAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cywgdmFsaWRhdGVfZmVhdHVyZXM9RmFsc2UpCiAgICAgICAgICAgIHJldHVybiByZXN1bHQudG9saXN0KCkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
-    commands: []
-    code_origin: https://github.com/daniels290813/functions.git#2675b0d235d93571a696296c93cfb2103cbf261f:/Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py
-    origin_filename: /Users/Daniel_Sabba/functions/xgb_serving/xgb_serving.py
-    requirements: []
-  description: deploy an XGBoost model server.
-  default_handler: ''
-  disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  min_replicas: 1
-  max_replicas: 4
-  source: ''
-  function_kind: serving_v2
-  function_handler: xgb_serving:handler
-  base_image_pull: false
-  default_class: ClassifierModel
-  secret_sources: []
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/static/item.html b/functions/development/xgb_serving/latest/static/item.html deleted file mode 100644 index 7cb5bff1..00000000 --- a/functions/development/xgb_serving/latest/static/item.html +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-apiVersion: v1
-categories:
-- model-serving
-- machine-learning
-description: deploy an XGBoost model server.
-doc: ''
-example: xgb_serving.ipynb
-generationDate: 2022-08-28:17-25
-hidden: false
-icon: ''
-labels:
-  author: Daniel
-maintainers: []
-marketplaceType: ''
-mlrunVersion: 1.4.1
-name: xgb_serving
-platformVersion: 3.5.3
-spec:
-  customFields:
-    default_class: ClassifierModel
-  filename: xgb_serving.py
-  handler: handler
-  image: mlrun/mlrun
-  kind: serving
-  requirements: []
-url: ''
-version: 1.1.2
-
-
-
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/static/source.html b/functions/development/xgb_serving/latest/static/source.html deleted file mode 100644 index 4789f702..00000000 --- a/functions/development/xgb_serving/latest/static/source.html +++ /dev/null @@ -1,54 +0,0 @@ - - - - - - - - - - - Source - - - - -
-        
-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import json
-import numpy as np
-from cloudpickle import load
-import mlrun
-
-
-class XGBoostModel(mlrun.serving.V2ModelServer):
-    def load(self):
-        model_file, extra_data = self.get_model(".pkl")
-        self.model = load(open(str(model_file), "rb"))
-
-    def predict(self, body):
-        try:
-            feats = np.asarray(body["inputs"], dtype=np.float32).reshape(-1, 5)
-            result = self.model.predict(feats, validate_features=False)
-            return result.tolist()
-        except Exception as e:
-            raise Exception("Failed to predict %s" % e)
-        
-    
- - \ No newline at end of file diff --git a/functions/development/xgb_serving/latest/static/xgb_serving.html b/functions/development/xgb_serving/latest/static/xgb_serving.html deleted file mode 100644 index e8f69eaf..00000000 --- a/functions/development/xgb_serving/latest/static/xgb_serving.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - -xgb_serving.xgb_serving - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
- - - -
- -
-
-
-
-
-
- - -
-
- -
-
-
-
-
- -
-

- -
-
-
-
-
-
-
-

Source code for xgb_serving.xgb_serving

-# Copyright 2019 Iguazio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import os
-import json
-import numpy as np
-from cloudpickle import load
-import mlrun
-
-
-
[docs]class XGBoostModel(mlrun.serving.V2ModelServer): -
[docs] def load(self): - model_file, extra_data = self.get_model(".pkl") - self.model = load(open(str(model_file), "rb"))
- -
[docs] def predict(self, body): - try: - feats = np.asarray(body["inputs"], dtype=np.float32).reshape(-1, 5) - result = self.model.predict(feats, validate_features=False) - return result.tolist() - except Exception as e: - raise Exception("Failed to predict %s" % e)
-
-
-
-
- -
-
-
-
-
- -
-
-
- - - - \ No newline at end of file